fix(agent): stabilize live e2e tool execution and loop isolation

2026-03-08 22:41:59 +08:00
parent 14508c52f6
commit 2980213a5b
32 changed files with 3076 additions and 560 deletions
@@ -0,0 +1,22 @@
+# Live E2E Test Suite
+
+`backend/tests/e2e/test_agent_live_flow.py` 是真实依赖端到端测试，依赖真实 LLM、Supabase DB、Supabase Storage。
+
+## Command Split
+
+- CI 默认测试（不跑 live）：
+
+```bash
+uv run pytest -m "not live"
+```
+
+- 手动运行 live 真实端到端（需设置 `AGENT_LIVE_E2E=1`，否则用例会被直接 skip）：
+
+```bash
+AGENT_LIVE_E2E=1 uv run pytest backend/tests/e2e/test_agent_live_flow.py -m live -v
+```
+
+## Notes
+
+- live 用例默认通过 marker 与常规回归隔离，避免 CI 因外部环境波动失败。
+- tool result 存储使用私有 bucket 读取校验，不依赖公共下载链接。
@@ -0,0 +1,562 @@
+from __future__ import annotations
+
+import base64
+import json
+import os
+import uuid
+from decimal import Decimal
+from pathlib import Path
+
+import pytest
+from sqlalchemy import delete, select
+
+from core.agent.application.resume_service import ResumeService
+from core.agent.application.run_service import RunService
+from core.agent.infrastructure.queue.tasks import run_agent_task
+from core.agent.infrastructure.storage.tool_result_storage import (
+    create_tool_result_storage,
+)
+from core.db import AsyncSessionLocal, engine
+from models.agent_chat_message import AgentChatMessage, AgentChatMessageRole
+from models.agent_chat_session import AgentChatSession, AgentChatSessionStatus
+from models.llm import Llm
+from models.llm_factory import LlmFactory
+from models.profile import Profile
+from models.schedule_items import ScheduleItem
+from models.system_agents import SystemAgents
+from services.base.supabase import supabase_service
+
+IMAGE_FIXTURE = (
+    Path(__file__).resolve().parents[1] / "fixtures" / "images" / "calendar_text_cn.png"
+)
+
+
+def _live_enabled() -> bool:
+    return os.getenv("AGENT_LIVE_E2E") == "1"
+
+
+async def _init_supabase_admin_client():
+    initialized = await supabase_service.initialize()
+    if not initialized:
+        pytest.skip("Supabase service unavailable")
+    return supabase_service.get_admin_client()
+
+
+async def _create_owner_profile(admin_client) -> tuple[uuid.UUID, str]:
+    user_email = f"agent-live-{uuid.uuid4().hex[:8]}@example.com"
+    created = admin_client.auth.admin.create_user(
+        {
+            "email": user_email,
+            "password": "Passw0rd!123",
+            "email_confirm": True,
+        }
+    )
+    user_id = str(created.user.id)
+    owner_id = uuid.UUID(user_id)
+    return owner_id, user_id
+
+
+async def _resolve_llm_id(
+    *,
+    target_model_code: str = "deepseek-chat",
+    target_factory_name: str = "deepseek",
+) -> tuple[uuid.UUID, uuid.UUID | None, uuid.UUID | None]:
+    await engine.dispose()
+    async with AsyncSessionLocal() as session:
+        llm_row = await session.execute(
+            select(Llm.id).where(Llm.model_code == target_model_code).limit(1)
+        )
+        llm_id = llm_row.scalar_one_or_none()
+        if llm_id is not None:
+            return llm_id, None, None
+
+    factory_id = uuid.uuid4()
+    llm_id = uuid.uuid4()
+    created_factory = False
+    async with AsyncSessionLocal() as session:
+        factory_row = await session.execute(
+            select(LlmFactory.id).where(LlmFactory.name == target_factory_name).limit(1)
+        )
+        existing_factory_id = factory_row.scalar_one_or_none()
+        if existing_factory_id is not None:
+            factory_id = existing_factory_id
+        else:
+            session.add(
+                LlmFactory(
+                    id=factory_id,
+                    name=target_factory_name,
+                    request_url=f"https://{target_factory_name}.example",
+                )
+            )
+            await session.commit()
+            created_factory = True
+
+    async with AsyncSessionLocal() as session:
+        session.add(
+            Llm(
+                id=llm_id,
+                factory_id=factory_id,
+                model_code=target_model_code,
+            )
+        )
+        await session.commit()
+    return llm_id, llm_id, factory_id if created_factory else None
+
+
+async def _seed_session_with_active_agent(
+    *,
+    session_id: uuid.UUID,
+    owner_id: uuid.UUID,
+    agent_type: str,
+    llm_id: uuid.UUID,
+) -> None:
+    await engine.dispose()
+    async with AsyncSessionLocal() as session:
+        session.add(SystemAgents(agent_type=agent_type, llm_id=llm_id, status="active"))
+        session.add(AgentChatSession(id=session_id, user_id=owner_id))
+        await session.commit()
+
+
+async def _cleanup_session_and_agent(
+    *,
+    session_id: uuid.UUID,
+    agent_type: str,
+    owner_id: uuid.UUID,
+    llm_id_to_cleanup: uuid.UUID | None,
+    factory_id_to_cleanup: uuid.UUID | None,
+) -> None:
+    async with AsyncSessionLocal() as session:
+        await session.execute(
+            delete(AgentChatSession).where(AgentChatSession.id == session_id)
+        )
+        await session.execute(
+            delete(SystemAgents).where(SystemAgents.agent_type == agent_type)
+        )
+        await session.execute(delete(Profile).where(Profile.id == owner_id))
+        if llm_id_to_cleanup is not None:
+            await session.execute(delete(Llm).where(Llm.id == llm_id_to_cleanup))
+        if factory_id_to_cleanup is not None:
+            await session.execute(
+                delete(LlmFactory).where(LlmFactory.id == factory_id_to_cleanup)
+            )
+        await session.commit()
+
+
+async def _cleanup_auth_user(*, admin_client, user_id: str | None) -> None:
+    if user_id is None:
+        return
+    try:
+        admin_client.auth.admin.delete_user(user_id)
+    except Exception:
+        return
+
+
+def _encode_fixture_image_base64() -> str:
+    data = IMAGE_FIXTURE.read_bytes()
+    return base64.b64encode(data).decode("ascii")
+
+
+@pytest.mark.asyncio
+@pytest.mark.live
+async def test_agent_live_intent_only_no_tool() -> None:
+    if not _live_enabled():
+        pytest.skip("Live test disabled")
+    session_id = uuid.uuid4()
+    agent_type = f"LIVE_E2E_{uuid.uuid4().hex[:8]}"
+    admin_client = await _init_supabase_admin_client()
+    owner_id, test_user_id = await _create_owner_profile(admin_client)
+    llm_id, llm_cleanup_id, factory_cleanup_id = await _resolve_llm_id()
+
+    try:
+        await _seed_session_with_active_agent(
+            session_id=session_id,
+            owner_id=owner_id,
+            agent_type=agent_type,
+            llm_id=llm_id,
+        )
+
+        result = await run_agent_task(
+            {
+                "command": "run",
+                "run_input": {
+                    "threadId": str(session_id),
+                    "runId": "run-live-intent-1",
+                    "state": {},
+                    "messages": [
+                        {
+                            "id": "u1",
+                            "role": "user",
+                            "content": "请用一句话介绍你是谁。",
+                        }
+                    ],
+                    "tools": [],
+                    "context": [],
+                    "forwardedProps": {},
+                },
+            },
+            run_service=RunService(),
+            resume_service=ResumeService(),
+        )
+
+        assert result["pending_tool_call_id"] is None
+
+        await engine.dispose()
+        async with AsyncSessionLocal() as session:
+            chat_session = await session.get(AgentChatSession, session_id)
+            assert chat_session is not None
+            assert chat_session.status == AgentChatSessionStatus.COMPLETED
+            rows = await session.execute(
+                select(AgentChatMessage)
+                .where(AgentChatMessage.session_id == session_id)
+                .order_by(AgentChatMessage.seq.asc())
+            )
+            messages = list(rows.scalars().all())
+            assert [m.role for m in messages] == [
+                AgentChatMessageRole.USER,
+                AgentChatMessageRole.ASSISTANT,
+            ]
+    finally:
+        await _cleanup_session_and_agent(
+            session_id=session_id,
+            agent_type=agent_type,
+            owner_id=owner_id,
+            llm_id_to_cleanup=llm_cleanup_id,
+            factory_id_to_cleanup=factory_cleanup_id,
+        )
+        await _cleanup_auth_user(admin_client=admin_client, user_id=test_user_id)
+        await supabase_service.close()
+
+
+@pytest.mark.asyncio
+@pytest.mark.live
+async def test_agent_live_image_calendar_tool_persistence() -> None:
+    if not _live_enabled():
+        pytest.skip("Live test disabled")
+
+    admin_client = await _init_supabase_admin_client()
+
+    tool_result_storage = create_tool_result_storage()
+    if tool_result_storage is None:
+        pytest.skip("Tool result storage unavailable")
+
+    storage = admin_client.storage
+    try:
+        storage.get_bucket("private")
+    except Exception:
+        storage.create_bucket("private", "private", {"public": False})
+
+    probe_path = f"tool-results/probe/{uuid.uuid4().hex}.json"
+    try:
+        storage.from_("private").upload(probe_path, b"{}")
+        storage.from_("private").remove([probe_path])
+    except Exception:
+        pytest.skip("Supabase private storage bucket is not writable")
+
+    owner_id, test_user_id = await _create_owner_profile(admin_client)
+    llm_id, llm_cleanup_id, factory_cleanup_id = await _resolve_llm_id(
+        target_model_code="qwen3.5-flash",
+        target_factory_name="dashscope",
+    )
+    session_id = uuid.uuid4()
+    agent_type = f"LIVE_E2E_{uuid.uuid4().hex[:8]}"
+    uploaded_paths: list[str] = []
+
+    try:
+        await _seed_session_with_active_agent(
+            session_id=session_id,
+            owner_id=owner_id,
+            agent_type=agent_type,
+            llm_id=llm_id,
+        )
+
+        image_b64 = _encode_fixture_image_base64()
+        result = await run_agent_task(
+            {
+                "command": "run",
+                "run_input": {
+                    "threadId": str(session_id),
+                    "runId": "run-live-image-1",
+                    "state": {},
+                    "messages": [
+                        {
+                            "id": "u1",
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": (
+                                        "请先识别图片中的日程文字，然后调用后端日历工具创建事件。"
+                                        "返回时请确保标题和开始时间不为空。"
+                                    ),
+                                },
+                                {
+                                    "type": "binary",
+                                    "mimeType": "image/png",
+                                    "data": image_b64,
+                                },
+                            ],
+                        }
+                    ],
+                    "tools": [],
+                    "context": [],
+                    "forwardedProps": {},
+                },
+            },
+            run_service=RunService(
+                tool_result_storage=tool_result_storage,
+                tool_result_offload_threshold_bytes=1,
+                tool_result_bucket="private",
+                tool_result_prefix="tool-results",
+            ),
+            resume_service=ResumeService(),
+        )
+
+        assert result["pending_tool_call_id"] is None
+
+        await engine.dispose()
+        async with AsyncSessionLocal() as session:
+            chat_session = await session.get(AgentChatSession, session_id)
+            assert chat_session is not None
+            assert chat_session.status == AgentChatSessionStatus.COMPLETED
+
+            schedule_rows = await session.execute(
+                select(ScheduleItem)
+                .where(ScheduleItem.owner_id == owner_id)
+                .order_by(ScheduleItem.created_at.desc())
+            )
+            created_items = list(schedule_rows.scalars().all())
+            assert created_items, (
+                "Expected schedule item created by backend calendar tool"
+            )
+            created_item = created_items[0]
+            assert created_item.title
+            assert created_item.timezone
+            assert created_item.start_at is not None
+
+            tool_rows = await session.execute(
+                select(AgentChatMessage)
+                .where(AgentChatMessage.session_id == session_id)
+                .where(AgentChatMessage.role == AgentChatMessageRole.TOOL)
+                .order_by(AgentChatMessage.seq.desc())
+            )
+            tool_message = tool_rows.scalars().first()
+            assert tool_message is not None
+            metadata = tool_message.metadata_json or {}
+            storage_bucket = metadata.get("storage_bucket")
+            storage_path = metadata.get("storage_path")
+            assert storage_bucket == "private"
+            assert isinstance(storage_path, str)
+            assert storage_path.startswith("tool-results/")
+            uploaded_paths.append(storage_path)
+
+        downloaded = storage.from_("private").download(uploaded_paths[0])
+        if isinstance(downloaded, bytes):
+            payload = json.loads(downloaded.decode("utf-8"))
+        else:
+            payload = json.loads(str(downloaded))
+
+        assert payload["toolName"] == "back.create_calendar_event"
+    finally:
+        if uploaded_paths:
+            try:
+                storage.from_("private").remove(uploaded_paths)
+            except Exception:
+                pass
+        async with AsyncSessionLocal() as cleanup_session:
+            await cleanup_session.execute(
+                delete(ScheduleItem).where(ScheduleItem.owner_id == owner_id)
+            )
+            await cleanup_session.commit()
+        await _cleanup_session_and_agent(
+            session_id=session_id,
+            agent_type=agent_type,
+            owner_id=owner_id,
+            llm_id_to_cleanup=llm_cleanup_id,
+            factory_id_to_cleanup=factory_cleanup_id,
+        )
+        await _cleanup_auth_user(admin_client=admin_client, user_id=test_user_id)
+        await supabase_service.close()
+
+
+@pytest.mark.asyncio
+@pytest.mark.live
+async def test_agent_live_front_tool_interrupt_resume_continue() -> None:
+    if not _live_enabled():
+        pytest.skip("Live test disabled")
+
+    admin_client = await _init_supabase_admin_client()
+    owner_id, test_user_id = await _create_owner_profile(admin_client)
+    llm_id, llm_cleanup_id, factory_cleanup_id = await _resolve_llm_id()
+    session_id = uuid.uuid4()
+    agent_type = f"LIVE_E2E_{uuid.uuid4().hex[:8]}"
+    queued_commands: list[dict[str, object]] = []
+    published_events: list[str] = []
+
+    async def _publish(event: dict[str, object]) -> None:
+        event_type = event.get("type")
+        if isinstance(event_type, str):
+            published_events.append(event_type)
+
+    async def _enqueue(command: dict[str, object]) -> str:
+        queued_commands.append(command)
+        return "task-followup-live"
+
+    try:
+        await _seed_session_with_active_agent(
+            session_id=session_id,
+            owner_id=owner_id,
+            agent_type=agent_type,
+            llm_id=llm_id,
+        )
+
+        run_result = await run_agent_task(
+            {
+                "command": "run",
+                "run_input": {
+                    "threadId": str(session_id),
+                    "runId": "run-live-front-1",
+                    "state": {},
+                    "messages": [
+                        {
+                            "id": "u1",
+                            "role": "user",
+                            "content": "你必须调用 front.navigate_to_route 工具跳转到 /calendar/dayweek。",
+                        }
+                    ],
+                    "tools": [
+                        {
+                            "name": "front.navigate_to_route",
+                            "description": "Navigate frontend route; runtime raises approval interrupt when called.",
+                            "parameters": {
+                                "type": "object",
+                                "properties": {
+                                    "target": {"type": "string"},
+                                    "replace": {"type": "boolean"},
+                                },
+                                "required": ["target"],
+                            },
+                        }
+                    ],
+                    "context": [],
+                    "forwardedProps": {},
+                },
+            },
+            publish_event=_publish,
+            enqueue_command=_enqueue,
+            run_service=RunService(),
+            resume_service=ResumeService(),
+        )
+
+        pending_tool_call_id = run_result["pending_tool_call_id"]
+        assert isinstance(pending_tool_call_id, str), (
+            f"Expected pending tool call, got result: {json.dumps(run_result, ensure_ascii=False)}"
+        )
+        snapshot = run_result["state_snapshot"]
+        assert isinstance(snapshot, dict)
+        pending_tool_nonce = snapshot.get("pending_tool_nonce")
+        assert isinstance(pending_tool_nonce, str)
+        guarded_tool_args: dict[str, object] | None = None
+        has_matching_tool_args_event = False
+        events = run_result.get("events")
+        if isinstance(events, list):
+            for event in events:
+                if not isinstance(event, dict):
+                    continue
+                if event.get("type") != "TOOL_CALL_ARGS":
+                    continue
+                if event.get("toolCallId") != pending_tool_call_id:
+                    continue
+                has_matching_tool_args_event = True
+                delta = event.get("delta")
+                if not isinstance(delta, str):
+                    continue
+                try:
+                    parsed_delta = json.loads(delta)
+                except (TypeError, ValueError):
+                    continue
+                if isinstance(parsed_delta, dict):
+                    guarded_tool_args = parsed_delta
+                    break
+        if has_matching_tool_args_event:
+            assert guarded_tool_args is not None
+        if guarded_tool_args is None:
+            guarded_tool_args = {
+                "target": "/calendar/dayweek",
+                "replace": False,
+                "__nonce": pending_tool_nonce,
+            }
+        assert guarded_tool_args.get("__nonce") == pending_tool_nonce
+
+        await run_agent_task(
+            {
+                "command": "resume",
+                "run_input": {
+                    "threadId": str(session_id),
+                    "runId": "run-live-front-2",
+                    "state": {},
+                    "messages": [
+                        {
+                            "id": "tool-1",
+                            "role": "tool",
+                            "toolCallId": pending_tool_call_id,
+                            "content": json.dumps(
+                                {
+                                    "toolName": "front.navigate_to_route",
+                                    "toolArgs": guarded_tool_args,
+                                    "nonce": pending_tool_nonce,
+                                    "result": {
+                                        "ok": True,
+                                        "route": "/calendar/dayweek",
+                                    },
+                                },
+                                ensure_ascii=True,
+                                separators=(",", ":"),
+                            ),
+                        }
+                    ],
+                    "tools": [],
+                    "context": [],
+                    "forwardedProps": {},
+                },
+            },
+            publish_event=_publish,
+            enqueue_command=_enqueue,
+            run_service=RunService(),
+            resume_service=ResumeService(),
+        )
+
+        assert len(queued_commands) == 1
+        await run_agent_task(
+            queued_commands[0],
+            publish_event=_publish,
+            enqueue_command=_enqueue,
+            run_service=RunService(),
+            resume_service=ResumeService(),
+        )
+
+        await engine.dispose()
+        async with AsyncSessionLocal() as session:
+            chat_session = await session.get(AgentChatSession, session_id)
+            assert chat_session is not None
+            assert chat_session.status == AgentChatSessionStatus.COMPLETED
+            rows = await session.execute(
+                select(AgentChatMessage)
+                .where(AgentChatMessage.session_id == session_id)
+                .order_by(AgentChatMessage.seq.asc())
+            )
+            messages = list(rows.scalars().all())
+            assert any(m.role == AgentChatMessageRole.TOOL for m in messages)
+            assert chat_session.total_cost >= Decimal("0")
+
+        assert "RUN_STARTED" in published_events
+        assert "RUN_FINISHED" in published_events
+    finally:
+        await _cleanup_session_and_agent(
+            session_id=session_id,
+            agent_type=agent_type,
+            owner_id=owner_id,
+            llm_id_to_cleanup=llm_cleanup_id,
+            factory_id_to_cleanup=factory_cleanup_id,
+        )
+        await _cleanup_auth_user(admin_client=admin_client, user_id=test_user_id)
+        await supabase_service.close()
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from core.agent.domain.agui_input import extract_latest_user_payload, parse_run_input
+
+
+def test_parse_run_input_accepts_binary_multimodal_content() -> None:
+    run_input = parse_run_input(
+        {
+            "threadId": "00000000-0000-0000-0000-000000000001",
+            "runId": "run-1",
+            "state": {},
+            "messages": [
+                {
+                    "id": "u1",
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "extract image"},
+                        {
+                            "type": "binary",
+                            "mimeType": "image/png",
+                            "data": "ZmFrZS1iYXNlNjQ=",
+                        },
+                    ],
+                }
+            ],
+            "tools": [],
+            "context": [],
+            "forwardedProps": {},
+        }
+    )
+
+    user_text, blocks = extract_latest_user_payload(run_input)
+    assert user_text == "extract image"
+    assert blocks[-1] == {
+        "type": "image_url",
+        "image_url": {"url": "data:image/png;base64,ZmFrZS1iYXNlNjQ="},
+    }
@@ -1,7 +1,5 @@
 from __future__ import annotations

-from pathlib import Path
-
 import pytest

 from core.agent.infrastructure.crewai.loader import (
@@ -35,31 +33,3 @@ def test_load_agent_task_template_returns_matching_pair() -> None:
 def test_load_agent_task_template_rejects_unknown_stage() -> None:
    with pytest.raises(ValueError, match="Unknown CrewAI stage"):
        load_agent_task_template(stage="unknown")
-
-
-def test_load_crewai_agent_templates_rejects_invalid_yaml_shape() -> None:
-    path = (
-        Path(__file__).resolve().parents[4]
-        / "src"
-        / "core"
-        / "config"
-        / "static"
-        / "crewai"
-        / "agents.invalid-shape.yaml"
-    )
-    path.write_text("- invalid\n", encoding="utf-8")
-    try:
-        with pytest.raises(ValueError, match="Invalid CrewAI template format"):
-            load_crewai_agent_templates(path)
-    finally:
-        path.unlink(missing_ok=True)
-
-
-def test_load_crewai_agent_templates_rejects_missing_required_fields() -> None:
-    path = Path(__file__).resolve().parents[4] / "src" / "core" / "config" / "static" / "crewai" / "agents.invalid.yaml"
-    path.write_text("intent:\n  role: Intent Agent\n", encoding="utf-8")
-    try:
-        with pytest.raises(ValueError, match="Invalid CrewAI agent template"):
-            load_crewai_agent_templates(path)
-    finally:
-        path.unlink(missing_ok=True)
@@ -3,8 +3,10 @@ from __future__ import annotations
 from types import MethodType, SimpleNamespace
 from typing import cast

+import core.agent.infrastructure.crewai.runtime as runtime_module
+import core.agent.infrastructure.crewai.runtime_stage_runner as stage_runner_module
 from core.agent.infrastructure.config.resolver import AgentConfigResolver, SettingsLike
-from core.agent.infrastructure.crewai.runtime import CrewAIRuntime
+from core.agent.infrastructure.crewai.runtime import CrewAIRuntime, _parse_intent_result
 from core.agent.infrastructure.litellm.usage_tracker import UsageCost


@@ -127,6 +129,298 @@ def test_runtime_needs_execution_and_collects_front_tool_call() -> None:
    assert result["total_tokens"] == 6


+def test_runtime_extracts_pending_front_tool_from_execution_data() -> None:
+    """A front tool named in ``execution_data`` is reported as ``pending_front_tool``.
+
+    The stubbed execution stage returns ``tool_name`` plus an ``arguments``
+    object with ``result_status: pending_approval``; ``execute`` must map it to
+    ``{"name", "args", "target": "frontend"}``.
+    """
+    runtime = _build_runtime()
+
+    # Stand-in for the CrewAI stage call: canned JSON text per stage.
+    def _fake_run_stage(self, **kwargs):
+        stage = kwargs["stage"]
+        if stage == "intent":
+            return (
+                '{"route":"NEEDS_EXECUTION","intent_summary":"navigate","execution_brief":"call tool","safety_flags":[]}',
+                UsageCost(1, 1, 2, 0.01),
+                [],
+                None,
+            )
+        if stage == "execution":
+            return (
+                '{"status":"SUCCESS","execution_summary":"done","execution_data":{"tool_name":"front.navigate_to_route","arguments":{"target":"/calendar/dayweek","replace":false},"result_status":"pending_approval"},"report_brief":"awaiting approval"}',
+                UsageCost(2, 2, 4, 0.02),
+                [],
+                None,
+            )
+        return (
+            '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}',
+            UsageCost(3, 3, 6, 0.03),
+            [],
+            None,
+        )
+
+    runtime._run_stage_with_crewai = MethodType(_fake_run_stage, runtime)  # type: ignore[method-assign]
+    result = runtime.execute(
+        user_input="go",
+        tools=[
+            {
+                "name": "front.navigate_to_route",
+                "description": "navigate",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "target": {"type": "string"},
+                        "replace": {"type": "boolean"},
+                    },
+                    "required": ["target"],
+                },
+            }
+        ],
+    )
+
+    assert result["pending_front_tool"] == {
+        "name": "front.navigate_to_route",
+        "args": {"target": "/calendar/dayweek", "replace": False},
+        "target": "frontend",
+    }
+
+
+def test_runtime_multimodal_intent_receives_execution_tool_awareness() -> None:
+    """With multimodal input, the intent stage's tools include the backend calendar tool.
+
+    No tools are passed to ``execute``; the test records the ``tools_payload``
+    each stage receives and checks the first recorded call.
+    """
+    runtime = _build_runtime()
+    calls: list[dict[str, object]] = []
+
+    # Stand-in for the CrewAI stage call that also records what it was given.
+    def _fake_run_stage(self, **kwargs):
+        stage = kwargs["stage"]
+        tools = kwargs["tools_payload"]
+        calls.append({"stage": stage, "tools": tools})
+        if stage == "intent":
+            return (
+                '{"route":"NEEDS_EXECUTION","intent_summary":"need tool","execution_brief":"call back.create_calendar_event","safety_flags":[]}',
+                UsageCost(1, 1, 2, 0.01),
+                [],
+                None,
+            )
+        if stage == "execution":
+            return (
+                '{"status":"SUCCESS","execution_summary":"done","execution_data":{},"report_brief":"ok"}',
+                UsageCost(2, 2, 4, 0.02),
+                [],
+                None,
+            )
+        return (
+            '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}',
+            UsageCost(3, 3, 6, 0.03),
+            [],
+            None,
+        )
+
+    runtime._run_stage_with_crewai = MethodType(_fake_run_stage, runtime)  # type: ignore[method-assign]
+    runtime.execute(
+        user_input="go",
+        user_input_multimodal=[{"type": "text", "text": "hello"}],
+        tools=[],
+    )
+
+    # calls[0] is assumed to be the intent stage (the first stage invoked).
+    intent_tools = cast(list[dict[str, object]], calls[0]["tools"])
+    assert any(t.get("name") == "back.create_calendar_event" for t in intent_tools)
+
+
+def test_runtime_synthesizes_backend_call_when_model_skips_react_tool_call() -> None:
+    """The runtime calls the backend tool itself when the stage reports no tool call.
+
+    The stubbed execution stage returns an empty tool-call list but carries
+    event fields in ``execution_data``; the registered backend handler must be
+    invoked once with those fields, and the call must appear in ``tool_calls``.
+    """
+    runtime = _build_runtime()
+
+    backend_calls: list[tuple[str, dict[str, object]]] = []
+
+    # Records each invocation and returns a calendar-card shaped result.
+    def _backend_handler(
+        tool_name: str, tool_args: dict[str, object]
+    ) -> dict[str, object]:
+        backend_calls.append((tool_name, tool_args))
+        return {
+            "type": "calendar_card.v1",
+            "version": "v1",
+            "data": {"id": "evt-1", "title": str(tool_args.get("title", ""))},
+            "actions": [],
+        }
+
+    runtime.set_backend_tool_handler(_backend_handler)
+
+    # Stand-in for the CrewAI stage call: canned JSON text per stage.
+    def _fake_run_stage(self, **kwargs):
+        stage = kwargs["stage"]
+        if stage == "intent":
+            return (
+                '{"route":"NEEDS_EXECUTION","intent_summary":"create event","execution_brief":"create via backend tool","safety_flags":[]}',
+                UsageCost(1, 1, 2, 0.01),
+                [],
+                None,
+            )
+        if stage == "execution":
+            return (
+                '{"status":"SUCCESS","execution_summary":"created","execution_data":{"title":"项目评审","timezone":"Asia/Shanghai"},"report_brief":"done"}',
+                UsageCost(2, 2, 4, 0.02),
+                [],
+                None,
+            )
+        return (
+            '{"assistant_text":"ok","response_metadata":{}}',
+            UsageCost(1, 1, 2, 0.01),
+            [],
+            None,
+        )
+
+    runtime._run_stage_with_crewai = MethodType(_fake_run_stage, runtime)  # type: ignore[method-assign]
+    result = runtime.execute(user_input="创建日程", tools=[])
+
+    assert backend_calls == [
+        (
+            "back.create_calendar_event",
+            {"title": "项目评审", "timezone": "Asia/Shanghai"},
+        )
+    ]
+    tool_calls = cast(list[dict[str, object]], result["tool_calls"])
+    assert any(
+        call.get("target") == "backend"
+        and call.get("name") == "back.create_calendar_event"
+        for call in tool_calls
+    )
+
+
+def test_runtime_extracts_pending_front_tool_from_approval_required_shape() -> None:
+    """The flat ``approval_required`` shape also yields ``pending_front_tool``.
+
+    Here ``execution_data`` carries ``target`` at the top level and no
+    ``arguments`` object; the expected args still include ``replace: False``,
+    which the stub never supplies.
+    """
+    runtime = _build_runtime()
+
+    # Stand-in for the CrewAI stage call: canned JSON text per stage.
+    def _fake_run_stage(self, **kwargs):
+        stage = kwargs["stage"]
+        if stage == "intent":
+            return (
+                '{"route":"NEEDS_EXECUTION","intent_summary":"navigate","execution_brief":"call tool","safety_flags":[]}',
+                UsageCost(1, 1, 2, 0.01),
+                [],
+                None,
+            )
+        if stage == "execution":
+            return (
+                '{"status":"PARTIAL","execution_summary":"approval needed","execution_data":{"tool_name":"front.navigate_to_route","target":"/calendar/dayweek","approval_required":true},"report_brief":"await approval"}',
+                UsageCost(2, 2, 4, 0.02),
+                [],
+                None,
+            )
+        return (
+            '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}',
+            UsageCost(3, 3, 6, 0.03),
+            [],
+            None,
+        )
+
+    runtime._run_stage_with_crewai = MethodType(_fake_run_stage, runtime)  # type: ignore[method-assign]
+    result = runtime.execute(
+        user_input="go",
+        tools=[
+            {
+                "name": "front.navigate_to_route",
+                "description": "navigate",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "target": {"type": "string"},
+                        "replace": {"type": "boolean"},
+                    },
+                    "required": ["target"],
+                },
+            }
+        ],
+    )
+
+    assert result["pending_front_tool"] == {
+        "name": "front.navigate_to_route",
+        "args": {"target": "/calendar/dayweek", "replace": False},
+        "target": "frontend",
+    }
+
+
+def test_runtime_resume_from_execution_stage_keeps_valid_intent_payload() -> None:
+    """``execute(resume_from_stage="execution")`` completes with only later stages stubbed.
+
+    The stub has no ``intent`` branch, so an intent call would receive the
+    fallback (organization-shaped) payload.
+    """
+    runtime = _build_runtime()
+
+    # Stand-in for the CrewAI stage call: execution payload, else the final answer.
+    def _fake_run_stage(self, **kwargs):
+        stage = kwargs["stage"]
+        if stage == "execution":
+            return (
+                '{"status":"SUCCESS","execution_summary":"done","execution_data":{},"report_brief":"ok"}',
+                UsageCost(2, 2, 4, 0.02),
+                [],
+                None,
+            )
+        return (
+            '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}',
+            UsageCost(3, 3, 6, 0.03),
+            [],
+            None,
+        )
+
+    runtime._run_stage_with_crewai = MethodType(_fake_run_stage, runtime)  # type: ignore[method-assign]
+    result = runtime.execute(
+        user_input="resume",
+        tools=[],
+        resume_from_stage="execution",
+    )
+
+    # NOTE(review): the stub's final stage returns "final answer", yet "ok" is
+    # expected here; presumably the runtime takes the text from elsewhere
+    # (e.g. the execution stage's report_brief) - confirm against the runtime.
+    assert result["assistant_text"] == "ok"
+
+
+def test_run_stage_with_crewai_uses_output_pydantic_for_stage(
+    monkeypatch,
+) -> None:
+    """The intent stage builds its Task with ``output_pydantic=IntentResult``.
+
+    ``LLM``, ``Agent``, ``Task`` and ``Crew`` in the stage-runner module are
+    replaced with fakes that capture their constructor kwargs. The returned
+    text must validate as an ``IntentResult`` carrying the ``pydantic`` payload
+    from ``kickoff`` (its ``raw`` field is ``"ignored"``), and usage must
+    reflect the fake ``token_usage``.
+    """
+    runtime = _build_runtime()
+    captured: dict[str, object] = {}
+
+    class _FakeLLM:
+        def __init__(self, **kwargs):
+            captured["llm_kwargs"] = kwargs
+
+    class _FakeAgent:
+        def __init__(self, **kwargs):
+            captured["agent_kwargs"] = kwargs
+            self.llm = kwargs.get("llm")
+
+    class _FakeTask:
+        def __init__(self, **kwargs):
+            captured["task_kwargs"] = kwargs
+
+    class _FakeCrew:
+        def __init__(self, **kwargs):
+            captured["crew_kwargs"] = kwargs
+
+        def kickoff(self):
+            # Structured result alongside a deliberately useless raw string.
+            return SimpleNamespace(
+                raw="ignored",
+                pydantic=runtime_module.IntentResult(
+                    route="DIRECT_EXECUTION",
+                    intent_summary="intent",
+                    assistant_text="ok",
+                    safety_flags=[],
+                ),
+                json_dict=None,
+                token_usage=SimpleNamespace(
+                    prompt_tokens=1,
+                    completion_tokens=2,
+                    total_tokens=3,
+                ),
+            )
+
+    monkeypatch.setattr(stage_runner_module, "LLM", _FakeLLM)
+    monkeypatch.setattr(stage_runner_module, "Agent", _FakeAgent)
+    monkeypatch.setattr(stage_runner_module, "Task", _FakeTask)
+    monkeypatch.setattr(stage_runner_module, "Crew", _FakeCrew)
+
+    text, usage, calls, pending = runtime._run_stage_with_crewai(
+        stage="intent",
+        user_content="hello",
+        system_prompt="",
+        tools_payload=[],
+        litellm_model="dashscope/qwen3.5-flash",
+    )
+
+    task_kwargs = cast(dict[str, object], captured["task_kwargs"])
+    assert task_kwargs.get("output_pydantic") is runtime_module.IntentResult
+    assert runtime_module.IntentResult.model_validate_json(text).assistant_text == "ok"
+    assert usage.total_tokens == 3
+    assert calls == []
+    assert pending is None
+
+
 def test_runtime_backend_registry_check() -> None:
    runtime = _build_runtime()
    assert runtime.is_registered_backend_tool("back.create_calendar_event") is True
@@ -179,3 +473,184 @@ def test_runtime_emits_step_started_finished_for_all_three_stages() -> None:
        "organization",
        "organization",
    ]
+
+
+def test_parse_intent_result_accepts_markdown_json_fence() -> None:
+    """Intent JSON wrapped in a Markdown ``json`` code fence is still parsed."""
+    fenced_payload = """```json
+{
+  "route": "DIRECT_EXECUTION",
+  "intent_summary": "navigate",
+  "assistant_text": "ok",
+  "safety_flags": []
+}
+```"""
+
+    parsed = _parse_intent_result(fenced_payload)
+
+    assert parsed.route == "DIRECT_EXECUTION"
+    assert parsed.assistant_text == "ok"
+
+
+def test_parse_intent_result_coerces_structured_fields() -> None:
+    """Object-valued ``execution_brief`` and ``safety_flags`` are coerced.
+
+    The payload declares ``DIRECT_EXECUTION`` while also carrying an execution
+    brief. The assertions expect the parsed route to be ``NEEDS_EXECUTION``,
+    the brief to contain the tool name, and the flags to be reduced to the
+    single entry whose value was true.
+    """
+    structured_payload = """{
+  "route": "DIRECT_EXECUTION",
+  "intent_summary": "navigate",
+  "assistant_text": "",
+  "execution_brief": {
+    "action": "front.navigate_to_route",
+    "target": "/calendar/dayweek"
+  },
+  "safety_flags": {
+    "security_concern": false,
+    "requires_confirmation": true
+  }
+}"""
+
+    parsed = _parse_intent_result(structured_payload)
+
+    assert parsed.route == "NEEDS_EXECUTION"
+    brief = parsed.execution_brief
+    assert brief is not None
+    assert "front.navigate_to_route" in brief
+    assert parsed.safety_flags == ["requires_confirmation"]
+
+
+def test_parse_intent_result_coerces_structured_intent_summary() -> None:
+    """An object-valued ``intent_summary`` ends up as JSON-looking text.
+
+    The assertions only require that the coerced summary starts with ``{`` and
+    still contains the original ``intent_type`` value.
+    """
+    structured_payload = """{
+  "route": "NEEDS_EXECUTION",
+  "intent_summary": {
+    "intent_type": "Navigation Request",
+    "confidence": 0.93
+  },
+  "execution_brief": "call front tool",
+  "safety_flags": []
+}"""
+
+    parsed = _parse_intent_result(structured_payload)
+
+    assert parsed.route == "NEEDS_EXECUTION"
+    summary = parsed.intent_summary
+    assert summary.startswith("{")
+    assert "Navigation Request" in summary
+
+
+def test_runtime_uses_prompt_module_for_stage_descriptions(monkeypatch) -> None:
+    """``Task`` receives the description built by ``runtime_stage_prompts``.
+
+    The CrewAI entry points on ``stage_runner_module`` are swapped for inert
+    fakes and ``build_stage_task_description`` for a stub, so the test observes
+    only whether the stub ran and which ``description`` reached ``Task``.
+    """
+    runtime = _build_runtime()
+    observed: dict[str, object] = {"called": False}
+
+    class _FakeLLM:
+        def __init__(self, **_unused):
+            pass
+
+    class _FakeAgent:
+        def __init__(self, **options):
+            self.llm = options.get("llm")
+
+    class _FakeTask:
+        def __init__(self, **options):
+            # Record the description so it can be compared with the stub output.
+            observed["description"] = options.get("description")
+
+    class _FakeCrew:
+        def __init__(self, **_unused):
+            pass
+
+        def kickoff(self):
+            intent = runtime_module.IntentResult(
+                route="DIRECT_EXECUTION",
+                intent_summary="intent",
+                assistant_text="ok",
+                safety_flags=[],
+            )
+            usage = SimpleNamespace(
+                prompt_tokens=1,
+                completion_tokens=2,
+                total_tokens=3,
+            )
+            return SimpleNamespace(
+                raw="ignored",
+                pydantic=intent,
+                json_dict=None,
+                token_usage=usage,
+            )
+
+    def _fake_build_stage_task_description(**_unused):
+        observed["called"] = True
+        return "PROMPT_FROM_MODULE"
+
+    crewai_fakes = {
+        "LLM": _FakeLLM,
+        "Agent": _FakeAgent,
+        "Task": _FakeTask,
+        "Crew": _FakeCrew,
+    }
+    for attr_name, fake in crewai_fakes.items():
+        monkeypatch.setattr(stage_runner_module, attr_name, fake)
+    monkeypatch.setattr(
+        stage_runner_module.runtime_stage_prompts,
+        "build_stage_task_description",
+        _fake_build_stage_task_description,
+    )
+
+    runtime._run_stage_with_crewai(
+        stage="intent",
+        user_content="hello",
+        system_prompt="",
+        tools_payload=[],
+        litellm_model="dashscope/qwen3.5-flash",
+    )
+
+    assert observed["called"] is True
+    assert observed["description"] == "PROMPT_FROM_MODULE"
+
+
+def test_run_stage_with_crewai_does_not_force_execution_output_pydantic(
+    monkeypatch,
+) -> None:
+    """The execution stage builds its ``Task`` without ``output_pydantic``.
+
+    CrewAI classes on ``stage_runner_module`` are replaced with fakes; the fake
+    ``Task`` records the ``output_pydantic`` keyword it was given, which is
+    expected to be ``None`` for the ``execution`` stage.
+    """
+    runtime = _build_runtime()
+    observed: dict[str, object] = {}
+
+    class _FakeLLM:
+        def __init__(self, **_unused):
+            pass
+
+    class _FakeAgent:
+        def __init__(self, **options):
+            self.llm = options.get("llm")
+
+    class _FakeTask:
+        def __init__(self, **options):
+            observed["output_pydantic"] = options.get("output_pydantic")
+
+    class _FakeCrew:
+        def __init__(self, **_unused):
+            pass
+
+        def kickoff(self):
+            # Raw JSON only: no structured pydantic/json_dict output is supplied.
+            raw_output = (
+                '{"status":"SUCCESS","execution_summary":"done",'
+                '"execution_data":{},"report_brief":"ok"}'
+            )
+            usage = SimpleNamespace(
+                prompt_tokens=1,
+                completion_tokens=2,
+                total_tokens=3,
+            )
+            return SimpleNamespace(
+                raw=raw_output,
+                pydantic=None,
+                json_dict=None,
+                token_usage=usage,
+            )
+
+    crewai_fakes = {
+        "LLM": _FakeLLM,
+        "Agent": _FakeAgent,
+        "Task": _FakeTask,
+        "Crew": _FakeCrew,
+    }
+    for attr_name, fake in crewai_fakes.items():
+        monkeypatch.setattr(stage_runner_module, attr_name, fake)
+
+    navigate_spec = {
+        "name": "front.navigate_to_route",
+        "description": "navigate",
+        "parameters": {
+            "type": "object",
+            "properties": {"target": {"type": "string"}},
+            "required": ["target"],
+        },
+    }
+    runtime._run_stage_with_crewai(
+        stage="execution",
+        user_content='{"user_input":"go","intent_summary":"navigate"}',
+        system_prompt="",
+        tools_payload=[navigate_spec],
+        litellm_model="dashscope/qwen3.5-flash",
+    )
+
+    assert observed["output_pydantic"] is None
@@ -0,0 +1,19 @@
+from __future__ import annotations
+
+from core.agent.infrastructure.crewai.runtime_parsers import parse_execution_result
+
+
+def test_parse_execution_result_preserves_execution_data_for_interrupted_status() -> (
+    None
+):
+    """An ``interrupted`` status is reported as ``PARTIAL`` with its data kept.
+
+    The assertions check that ``tool_called`` and ``input`` from the raw
+    ``execution_data`` are still readable on the parsed result.
+    """
+    raw_result = (
+        '{"status":"interrupted","execution_summary":"approval needed",'
+        '"execution_data":{"tool_called":"front.navigate_to_route",'
+        '"input":{"target":"/calendar/dayweek"},'
+        '"error":"frontend tool requires approval"},'
+        '"report_brief":"await approval"}'
+    )
+
+    parsed = parse_execution_result(raw_result)
+
+    assert parsed.status == "PARTIAL"
+    assert parsed.execution_data.get("tool_called") == "front.navigate_to_route"
+    assert parsed.execution_data.get("input") == {"target": "/calendar/dayweek"}
@@ -0,0 +1,223 @@
+from __future__ import annotations
+
+import pytest
+from crewai.agents import parser as crew_parser
+
+from core.agent.infrastructure.crewai.runtime_tools import (
+    PendingFrontendToolCall,
+    extract_pending_front_tool,
+    resolve_stage_crewai_tools,
+)
+
+
+def test_frontend_tool_accepts_direct_kwargs_and_raises_pending() -> None:
+    """Running a ``front.*`` tool with keyword args raises the pending signal.
+
+    The raised ``PendingFrontendToolCall`` is expected to carry the tool name
+    and the exact keyword arguments in its ``payload``.
+    """
+    navigate_spec = {
+        "name": "front.navigate_to_route",
+        "description": "Navigate to route",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "target": {"type": "string"},
+                "replace": {"type": "boolean"},
+            },
+            "required": ["target"],
+        },
+    }
+    recorded_calls: list[dict[str, object]] = []
+    stage_tools = resolve_stage_crewai_tools(
+        tools_payload=[navigate_spec],
+        calls=recorded_calls,
+        backend_handler=None,
+    )
+    navigate_tool = stage_tools[0]
+
+    with pytest.raises(PendingFrontendToolCall) as raised:
+        navigate_tool.run(target="/calendar/dayweek", replace=False)
+
+    payload = raised.value.payload
+    assert payload["name"] == "front.navigate_to_route"
+    assert payload["args"] == {"target": "/calendar/dayweek", "replace": False}
+
+
+def test_react_action_text_can_address_frontend_tool_name() -> None:
+    """A ReAct ``Action:`` line can name a dotted frontend tool.
+
+    The text is parsed with CrewAI's parser, the resolved tool whose name
+    matches the parsed action is looked up, and running it is expected to
+    raise ``PendingFrontendToolCall`` for that tool.
+    """
+    react_text = (
+        "Thought: need route change\n"
+        "Action: front.navigate_to_route\n"
+        'Action Input: {"target":"/calendar/dayweek","replace":false}'
+    )
+    action = crew_parser.parse(react_text)
+    assert isinstance(action, crew_parser.AgentAction)
+
+    navigate_spec = {
+        "name": "front.navigate_to_route",
+        "description": "Navigate to route",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "target": {"type": "string"},
+                "replace": {"type": "boolean"},
+            },
+            "required": ["target"],
+        },
+    }
+    recorded_calls: list[dict[str, object]] = []
+    stage_tools = resolve_stage_crewai_tools(
+        tools_payload=[navigate_spec],
+        calls=recorded_calls,
+        backend_handler=None,
+    )
+    # Select the tool by the name the parser extracted from the Action line.
+    matched_tool = next(
+        candidate for candidate in stage_tools if candidate.name == action.tool
+    )
+
+    with pytest.raises(PendingFrontendToolCall) as raised:
+        matched_tool.run(target="/calendar/dayweek", replace=False)
+
+    assert raised.value.payload["name"] == "front.navigate_to_route"
+
+
+def test_dynamic_tool_args_schema_follows_tool_parameters() -> None:
+    """The generated ``args_schema`` mirrors the tool's declared parameters.
+
+    Both declared properties must appear in the JSON schema and only
+    ``target`` may be listed as required.
+    """
+    navigate_spec = {
+        "name": "front.navigate_to_route",
+        "description": "Navigate to route",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "target": {"type": "string"},
+                "replace": {"type": "boolean"},
+            },
+            "required": ["target"],
+        },
+    }
+    recorded_calls: list[dict[str, object]] = []
+    stage_tools = resolve_stage_crewai_tools(
+        tools_payload=[navigate_spec],
+        calls=recorded_calls,
+        backend_handler=None,
+    )
+
+    json_schema = stage_tools[0].args_schema.model_json_schema()
+    properties = json_schema.get("properties", {})
+    required_fields = json_schema.get("required", [])
+
+    assert isinstance(properties, dict)
+    for field_name in ("target", "replace"):
+        assert field_name in properties
+    assert required_fields == ["target"]
+
+
+def test_extract_pending_front_tool_supports_tool_called_and_input_fields() -> None:
+    """``tool_called`` + ``input`` with a pending status yields a frontend call.
+
+    The expected result also carries ``replace: False`` although the reported
+    input only contains ``target``.
+    """
+    navigate_spec = {
+        "name": "front.navigate_to_route",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "target": {"type": "string"},
+                "replace": {"type": "boolean"},
+            },
+        },
+    }
+    reported_execution = {
+        "tool_called": "front.navigate_to_route",
+        "input": {"target": "/calendar/dayweek"},
+        "status": "pending_approval",
+    }
+    expected_pending = {
+        "name": "front.navigate_to_route",
+        "args": {"target": "/calendar/dayweek", "replace": False},
+        "target": "frontend",
+    }
+
+    pending = extract_pending_front_tool(
+        execution_tools=[navigate_spec],
+        pending_call=None,
+        execution_data=reported_execution,
+    )
+
+    assert pending == expected_pending
+
+
+def test_extract_pending_front_tool_supports_interrupted_status_with_error() -> None:
+    """An ``interrupted`` status with an approval error yields a frontend call."""
+    navigate_spec = {
+        "name": "front.navigate_to_route",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "target": {"type": "string"},
+                "replace": {"type": "boolean"},
+            },
+        },
+    }
+    reported_execution = {
+        "status": "interrupted",
+        "tool_called": "front.navigate_to_route",
+        "parameters": {"target": "/calendar/dayweek", "replace": False},
+        "error": "frontend tool requires approval",
+    }
+    expected_pending = {
+        "name": "front.navigate_to_route",
+        "args": {"target": "/calendar/dayweek", "replace": False},
+        "target": "frontend",
+    }
+
+    pending = extract_pending_front_tool(
+        execution_tools=[navigate_spec],
+        pending_call=None,
+        execution_data=reported_execution,
+    )
+
+    assert pending == expected_pending
+
+
+def test_extract_pending_front_tool_supports_approval_result_field() -> None:
+    """A ``result`` of ``approval_required_error`` yields a frontend call."""
+    navigate_spec = {
+        "name": "front.navigate_to_route",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "target": {"type": "string"},
+                "replace": {"type": "boolean"},
+            },
+        },
+    }
+    reported_execution = {
+        "tool_called": "front.navigate_to_route",
+        "parameters": {"target": "/calendar/dayweek", "replace": False},
+        "result": "approval_required_error",
+    }
+    expected_pending = {
+        "name": "front.navigate_to_route",
+        "args": {"target": "/calendar/dayweek", "replace": False},
+        "target": "frontend",
+    }
+
+    pending = extract_pending_front_tool(
+        execution_tools=[navigate_spec],
+        pending_call=None,
+        execution_data=reported_execution,
+    )
+
+    assert pending == expected_pending
+
+
+def test_extract_pending_front_tool_supports_observation_field() -> None:
+    """An approval message in ``observation`` yields a frontend call."""
+    navigate_spec = {
+        "name": "front.navigate_to_route",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "target": {"type": "string"},
+                "replace": {"type": "boolean"},
+            },
+        },
+    }
+    reported_execution = {
+        "tool_called": "front.navigate_to_route",
+        "parameters": {"target": "/calendar/dayweek", "replace": False},
+        "observation": "frontend tool requires approval.",
+    }
+    expected_pending = {
+        "name": "front.navigate_to_route",
+        "args": {"target": "/calendar/dayweek", "replace": False},
+        "target": "frontend",
+    }
+
+    pending = extract_pending_front_tool(
+        execution_tools=[navigate_spec],
+        pending_call=None,
+        execution_data=reported_execution,
+    )
+
+    assert pending == expected_pending
@@ -0,0 +1,16 @@
+from __future__ import annotations
+
+from core.agent.prompt.runtime_stage_prompts import build_stage_task_description
+
+
+def test_execution_stage_prompt_includes_react_tool_invocation_rule() -> None:
+    """The execution-stage prompt contains both ReAct invocation markers."""
+    rendered_prompt = build_stage_task_description(
+        stage="execution",
+        task_description="execute",
+        tools_payload=[{"name": "front.navigate_to_route"}],
+        system_prompt="",
+        user_content="go",
+    )
+
+    for marker in ("Action:", "Action Input:"):
+        assert marker in rendered_prompt
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+import pytest
+
+import core.agent.infrastructure.crewai.tools.stage_tool_allowlist as allowlist_module
+
+
+def test_load_crewai_stage_tools_returns_expected_defaults() -> None:
+    """The default allowlist exposes one backend tool, on ``execution`` only."""
+    expected_defaults = {
+        "intent": [],
+        "execution": ["back.create_calendar_event"],
+        "organization": [],
+    }
+
+    loaded = allowlist_module.load_crewai_stage_tools()
+
+    assert loaded == expected_defaults
+
+
+def test_load_crewai_stage_tools_rejects_unknown_backend_tool(monkeypatch) -> None:
+    """Loading fails with ``ValueError`` when the allowlist names ``back.unknown``."""
+    broken_allowlist = {"execution": ["back.unknown"]}
+    monkeypatch.setattr(allowlist_module, "STAGE_TOOL_ALLOWLIST", broken_allowlist)
+
+    with pytest.raises(ValueError, match="unknown backend tool"):
+        allowlist_module.load_crewai_stage_tools()
@@ -1,5 +1,7 @@
 from __future__ import annotations

+import asyncio
+
 import pytest

 from core.config.settings import RedisSettings
@@ -107,7 +109,9 @@ async def test_get_or_init_redis_client_initializes_when_needed(
    async def _fake_initialize() -> bool:
        return True

-    monkeypatch.setattr(type(redis_service), "is_initialized", property(lambda _: False))
+    monkeypatch.setattr(
+        type(redis_service), "is_initialized", property(lambda _: False)
+    )
    monkeypatch.setattr(redis_service, "initialize", _fake_initialize)
    monkeypatch.setattr(redis_service, "get_client", lambda: fake_client)

@@ -123,8 +127,40 @@ async def test_get_or_init_redis_client_raises_when_init_fails(
    async def _fake_initialize() -> bool:
        return False

-    monkeypatch.setattr(type(redis_service), "is_initialized", property(lambda _: False))
+    monkeypatch.setattr(
+        type(redis_service), "is_initialized", property(lambda _: False)
+    )
    monkeypatch.setattr(redis_service, "initialize", _fake_initialize)

    with pytest.raises(RuntimeError, match="Redis service initialization failed"):
        await get_or_init_redis_client()
+
+
+@pytest.mark.asyncio
+async def test_get_or_init_redis_client_reinitializes_when_event_loop_changes(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    # Checks that get_or_init_redis_client() calls initialize() exactly once and
+    # returns the client from get_client(), even though the service is already
+    # flagged as initialized with a previously stored client and loop id.
+    stale_client = _FakeRedisClient()
+    fresh_client = _FakeRedisClient()
+    call_count = {"initialize": 0}
+
+    async def _fake_initialize() -> bool:
+        # Count invocations so the test can prove a re-initialization happened.
+        call_count["initialize"] += 1
+        return True
+
+    class _Loop:
+        # Placeholder object returned in place of the real running loop.
+        pass
+
+    loop_obj = _Loop()
+
+    # NOTE(review): this replaces asyncio.get_running_loop for the whole process
+    # until monkeypatch undoes it, while the test itself runs inside a loop.
+    monkeypatch.setattr(asyncio, "get_running_loop", lambda: loop_obj)
+    monkeypatch.setattr(redis_service, "initialize", _fake_initialize)
+    monkeypatch.setattr(redis_service, "get_client", lambda: fresh_client)
+    # Seed the private state as if a client had already been set up.
+    # NOTE(review): presumably 123 never matches whatever identifier the service
+    # derives from loop_obj; confirm how _loop_id is computed.
+    monkeypatch.setattr(redis_service, "_client", stale_client, raising=False)
+    monkeypatch.setattr(redis_service, "_loop_id", 123, raising=False)
+    monkeypatch.setattr(redis_service, "_initialized", True, raising=False)
+
+    client = await get_or_init_redis_client()
+
+    assert call_count["initialize"] == 1
+    assert client is fresh_client