feat(agent): support multimodal intent input and ASR transcribe endpoint

2026-03-08 17:34:28 +08:00
parent 5ada60e834
commit 1060503a2d
11 changed files with 422 additions and 74 deletions
@@ -1,5 +1,9 @@
 from __future__ import annotations

+from types import SimpleNamespace
+from typing import Any
+from unittest.mock import AsyncMock, patch
+
 from core.agent.infrastructure.litellm.client import run_completion


@@ -53,3 +57,46 @@ def test_run_completion_omits_optional_params_when_none(monkeypatch) -> None:
    assert "temperature" not in captured
    assert "max_tokens" not in captured
    assert "timeout" not in captured
+
+
+def test_image_content_block_is_preserved_for_llm(monkeypatch) -> None:
+    """Check the message list handed to the patched `completion` callable.
+
+    The single user message must keep its two-part [text, image_url] content.
+    """
+    seen_kwargs: dict[str, object] = {}
+    image_url = "https://example.com/image.png"
+
+    def _record_call(**kwargs):  # type: ignore[no-untyped-def]
+        # Stand-in for the patched `completion`: remember what it was given.
+        seen_kwargs.update(kwargs)
+        return SimpleNamespace(model_dump=lambda: {"choices": []})
+
+    monkeypatch.setattr(
+        "core.agent.infrastructure.litellm.client.completion", _record_call
+    )
+
+    text_block = {"type": "text", "text": "分析这个图片"}
+    image_block = {"type": "image_url", "image_url": {"url": image_url}}
+    user_message = {"role": "user", "content": [text_block, image_block]}
+
+    run_completion(
+        model="dashscope/qwen3.5-flash",
+        api_key="key",
+        messages=[user_message],
+    )
+
+    # The fake must have been handed exactly one message ...
+    assert "messages" in seen_kwargs
+    forwarded = seen_kwargs["messages"]
+    assert isinstance(forwarded, list)
+    assert len(forwarded) == 1
+
+    # ... whose content is still a two-part list: text first, image second.
+    parts = forwarded[0]["content"]
+    assert isinstance(parts, list)
+    assert len(parts) == 2
+    text_part, image_part = parts
+    assert text_part["type"] == "text"
+    assert image_part["type"] == "image_url"
+    assert image_part["image_url"]["url"] == image_url