fix(agent): stabilize live e2e tool execution and loop isolation

This commit is contained in:
zl-q
2026-03-08 22:41:59 +08:00
parent 14508c52f6
commit 2980213a5b
32 changed files with 3076 additions and 560 deletions
+22
View File
@@ -0,0 +1,22 @@
# Live E2E Test Suite
`backend/tests/e2e/test_agent_live_flow.py` 是真实依赖端到端测试,依赖真实 LLM、Supabase DB、Supabase Storage。
## Command Split
- CI 默认测试(不跑 live):
```bash
uv run pytest -m "not live"
```
- 手动运行 live 真实端到端(必须设置环境变量 `AGENT_LIVE_E2E=1`,否则所有 live 用例会被直接跳过):
```bash
AGENT_LIVE_E2E=1 uv run pytest backend/tests/e2e/test_agent_live_flow.py -m live -v
```
## Notes
- live 用例默认通过 marker 与常规回归隔离,避免 CI 因外部环境波动失败。
- tool result 存储使用私有 bucket 读取校验,不依赖公共下载链接。
+562
View File
@@ -0,0 +1,562 @@
from __future__ import annotations
import base64
import json
import os
import uuid
from decimal import Decimal
from pathlib import Path
import pytest
from sqlalchemy import delete, select
from core.agent.application.resume_service import ResumeService
from core.agent.application.run_service import RunService
from core.agent.infrastructure.queue.tasks import run_agent_task
from core.agent.infrastructure.storage.tool_result_storage import (
create_tool_result_storage,
)
from core.db import AsyncSessionLocal, engine
from models.agent_chat_message import AgentChatMessage, AgentChatMessageRole
from models.agent_chat_session import AgentChatSession, AgentChatSessionStatus
from models.llm import Llm
from models.llm_factory import LlmFactory
from models.profile import Profile
from models.schedule_items import ScheduleItem
from models.system_agents import SystemAgents
from services.base.supabase import supabase_service
IMAGE_FIXTURE = (
Path(__file__).resolve().parents[1] / "fixtures" / "images" / "calendar_text_cn.png"
)
def _live_enabled() -> bool:
    """Report whether live end-to-end tests were explicitly switched on.

    Live tests run only when the ``AGENT_LIVE_E2E`` environment variable is
    exactly ``"1"``; an unset variable or any other value leaves them off.
    """
    flag = os.getenv("AGENT_LIVE_E2E")
    return flag == "1"
async def _init_supabase_admin_client():
    """Initialize the shared Supabase service and return its admin client.

    Skips the calling test when ``supabase_service.initialize()`` returns a
    falsy value, because no live check can run without the service.
    """
    if not await supabase_service.initialize():
        pytest.skip("Supabase service unavailable")
    return supabase_service.get_admin_client()
async def _create_owner_profile(admin_client) -> tuple[uuid.UUID, str]:
    """Create a throwaway, email-confirmed auth user for a live test.

    Args:
        admin_client: Client exposing ``auth.admin.create_user``.

    Returns:
        ``(owner_id, user_id)``: the new user's id as a ``uuid.UUID`` and the
        same id in string form.
    """
    suffix = uuid.uuid4().hex[:8]
    payload = {
        "email": f"agent-live-{suffix}@example.com",
        "password": "Passw0rd!123",
        "email_confirm": True,
    }
    response = admin_client.auth.admin.create_user(payload)
    user_id = str(response.user.id)
    return uuid.UUID(user_id), user_id
async def _resolve_llm_id(
    *,
    target_model_code: str = "deepseek-chat",
    target_factory_name: str = "deepseek",
) -> tuple[uuid.UUID, uuid.UUID | None, uuid.UUID | None]:
    """Find or create the ``Llm`` row a live test attaches its agent to.

    Looks up an ``Llm`` by ``model_code``. When none exists a new row is
    inserted, reusing an ``LlmFactory`` with the requested name or inserting
    one as well.

    Args:
        target_model_code: ``Llm.model_code`` to look up or create.
        target_factory_name: ``LlmFactory.name`` to reuse or create when the
            model row has to be inserted.

    Returns:
        ``(llm_id, llm_id_to_cleanup, factory_id_to_cleanup)``. Each cleanup id
        is ``None`` when the corresponding row already existed, so callers
        only delete what this helper inserted.
    """
    # NOTE(review): presumably drops pooled connections bound to an earlier
    # event loop before opening new sessions -- confirm against core.db.
    await engine.dispose()
    async with AsyncSessionLocal() as session:
        llm_row = await session.execute(
            select(Llm.id).where(Llm.model_code == target_model_code).limit(1)
        )
        llm_id = llm_row.scalar_one_or_none()
        if llm_id is not None:
            # Pre-existing model: nothing for the caller to clean up.
            return llm_id, None, None
    # Candidate ids for rows that may have to be inserted below.
    factory_id = uuid.uuid4()
    llm_id = uuid.uuid4()
    created_factory = False
    async with AsyncSessionLocal() as session:
        factory_row = await session.execute(
            select(LlmFactory.id).where(LlmFactory.name == target_factory_name).limit(1)
        )
        existing_factory_id = factory_row.scalar_one_or_none()
        if existing_factory_id is not None:
            # Reuse the existing factory; it must not be deleted afterwards.
            factory_id = existing_factory_id
        else:
            session.add(
                LlmFactory(
                    id=factory_id,
                    name=target_factory_name,
                    request_url=f"https://{target_factory_name}.example",
                )
            )
            await session.commit()
            created_factory = True
    # The factory is committed in its own session before the Llm row that
    # references it is inserted.
    async with AsyncSessionLocal() as session:
        session.add(
            Llm(
                id=llm_id,
                factory_id=factory_id,
                model_code=target_model_code,
            )
        )
        await session.commit()
    # The conditional applies to the third tuple element only.
    return llm_id, llm_id, factory_id if created_factory else None
async def _seed_session_with_active_agent(
    *,
    session_id: uuid.UUID,
    owner_id: uuid.UUID,
    agent_type: str,
    llm_id: uuid.UUID,
) -> None:
    """Insert an active ``SystemAgents`` row and a chat session for a test.

    Args:
        session_id: Primary key for the new ``AgentChatSession``.
        owner_id: User id stored on the chat session.
        agent_type: ``agent_type`` value for the new ``SystemAgents`` row.
        llm_id: ``Llm`` id the agent row points at.
    """
    await engine.dispose()
    agent_row = SystemAgents(agent_type=agent_type, llm_id=llm_id, status="active")
    chat_row = AgentChatSession(id=session_id, user_id=owner_id)
    async with AsyncSessionLocal() as db:
        for row in (agent_row, chat_row):
            db.add(row)
        await db.commit()
async def _cleanup_session_and_agent(
    *,
    session_id: uuid.UUID,
    agent_type: str,
    owner_id: uuid.UUID,
    llm_id_to_cleanup: uuid.UUID | None,
    factory_id_to_cleanup: uuid.UUID | None,
) -> None:
    """Delete the rows a live test seeded, all in one transaction.

    The chat session, the agent row and the owner's ``Profile`` are always
    deleted. The ``Llm`` and ``LlmFactory`` rows are deleted only when their
    cleanup id is not ``None``.
    """
    # Statements are executed in list order and committed once at the end.
    statements = [
        delete(AgentChatSession).where(AgentChatSession.id == session_id),
        delete(SystemAgents).where(SystemAgents.agent_type == agent_type),
        delete(Profile).where(Profile.id == owner_id),
    ]
    if llm_id_to_cleanup is not None:
        statements.append(delete(Llm).where(Llm.id == llm_id_to_cleanup))
    if factory_id_to_cleanup is not None:
        statements.append(
            delete(LlmFactory).where(LlmFactory.id == factory_id_to_cleanup)
        )
    async with AsyncSessionLocal() as db:
        for statement in statements:
            await db.execute(statement)
        await db.commit()
async def _cleanup_auth_user(*, admin_client, user_id: str | None) -> None:
    """Best-effort deletion of the auth user created for a live test.

    Args:
        admin_client: Client exposing ``auth.admin.delete_user``.
        user_id: Id of the user to delete; ``None`` means nothing was created
            and the call is a no-op.

    A failed deletion never raises, so it cannot mask the real test outcome,
    but it is logged so leaked users in the live project can be found.
    """
    import logging

    if user_id is None:
        return
    try:
        admin_client.auth.admin.delete_user(user_id)
    except Exception as exc:  # cleanup must stay best-effort
        logging.getLogger(__name__).warning(
            "Failed to delete live-test auth user %s: %r", user_id, exc
        )
def _encode_fixture_image_base64() -> str:
    """Return the bytes of ``IMAGE_FIXTURE`` as an ASCII base64 string."""
    raw_bytes = IMAGE_FIXTURE.read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("ascii")
@pytest.mark.asyncio
@pytest.mark.live
async def test_agent_live_intent_only_no_tool() -> None:
    """Live run: a plain question completes without any pending tool call.

    Checks that the run reports no pending tool call, that the session ends
    ``COMPLETED``, and that exactly one user and one assistant message are
    stored, in that order.

    All setup after Supabase initialization happens inside ``try`` so the
    auth user and the Supabase service are released even when setup or the
    database cleanup fails.
    """
    if not _live_enabled():
        pytest.skip("Live test disabled")
    session_id = uuid.uuid4()
    agent_type = f"LIVE_E2E_{uuid.uuid4().hex[:8]}"
    admin_client = await _init_supabase_admin_client()
    # Pre-declare everything the cleanup reads so ``finally`` is safe even
    # when a setup step raises part-way through.
    owner_id: uuid.UUID | None = None
    test_user_id: str | None = None
    llm_cleanup_id: uuid.UUID | None = None
    factory_cleanup_id: uuid.UUID | None = None
    try:
        owner_id, test_user_id = await _create_owner_profile(admin_client)
        llm_id, llm_cleanup_id, factory_cleanup_id = await _resolve_llm_id()
        await _seed_session_with_active_agent(
            session_id=session_id,
            owner_id=owner_id,
            agent_type=agent_type,
            llm_id=llm_id,
        )
        result = await run_agent_task(
            {
                "command": "run",
                "run_input": {
                    "threadId": str(session_id),
                    "runId": "run-live-intent-1",
                    "state": {},
                    "messages": [
                        {
                            "id": "u1",
                            "role": "user",
                            "content": "请用一句话介绍你是谁。",
                        }
                    ],
                    "tools": [],
                    "context": [],
                    "forwardedProps": {},
                },
            },
            run_service=RunService(),
            resume_service=ResumeService(),
        )
        assert result["pending_tool_call_id"] is None
        await engine.dispose()
        async with AsyncSessionLocal() as session:
            chat_session = await session.get(AgentChatSession, session_id)
            assert chat_session is not None
            assert chat_session.status == AgentChatSessionStatus.COMPLETED
            rows = await session.execute(
                select(AgentChatMessage)
                .where(AgentChatMessage.session_id == session_id)
                .order_by(AgentChatMessage.seq.asc())
            )
            messages = list(rows.scalars().all())
            assert [m.role for m in messages] == [
                AgentChatMessageRole.USER,
                AgentChatMessageRole.ASSISTANT,
            ]
    finally:
        try:
            # Rows can only exist once the owner was created.
            if owner_id is not None:
                await _cleanup_session_and_agent(
                    session_id=session_id,
                    agent_type=agent_type,
                    owner_id=owner_id,
                    llm_id_to_cleanup=llm_cleanup_id,
                    factory_id_to_cleanup=factory_cleanup_id,
                )
        finally:
            # Always release external resources, even if DB cleanup raised.
            await _cleanup_auth_user(admin_client=admin_client, user_id=test_user_id)
            await supabase_service.close()
@pytest.mark.asyncio
@pytest.mark.live
async def test_agent_live_image_calendar_tool_persistence() -> None:
    """Live run: an image prompt creates a schedule item and offloads the tool result.

    Checks that the session ends ``COMPLETED``, that a ``ScheduleItem`` with a
    title, timezone and start time exists for the owner, and that the latest
    tool message points at an object in the ``private`` bucket whose JSON
    payload names ``back.create_calendar_event``.

    Every step after Supabase initialization runs inside ``try`` so the skip
    paths and setup failures still close the Supabase service and delete the
    auth user.
    """
    if not _live_enabled():
        pytest.skip("Live test disabled")
    admin_client = await _init_supabase_admin_client()
    session_id = uuid.uuid4()
    agent_type = f"LIVE_E2E_{uuid.uuid4().hex[:8]}"
    # Pre-declare everything the cleanup reads so ``finally`` is safe even
    # when a setup step raises or skips part-way through.
    owner_id: uuid.UUID | None = None
    test_user_id: str | None = None
    llm_cleanup_id: uuid.UUID | None = None
    factory_cleanup_id: uuid.UUID | None = None
    uploaded_paths: list[str] = []
    storage = None
    try:
        tool_result_storage = create_tool_result_storage()
        if tool_result_storage is None:
            pytest.skip("Tool result storage unavailable")
        storage = admin_client.storage
        try:
            storage.get_bucket("private")
        except Exception:
            # Bucket lookup failed; try to create it as a non-public bucket.
            storage.create_bucket("private", "private", {"public": False})
        # Probe write access up front so an unusable bucket skips the test
        # instead of failing it after the LLM run.
        probe_path = f"tool-results/probe/{uuid.uuid4().hex}.json"
        try:
            storage.from_("private").upload(probe_path, b"{}")
            storage.from_("private").remove([probe_path])
        except Exception:
            pytest.skip("Supabase private storage bucket is not writable")
        owner_id, test_user_id = await _create_owner_profile(admin_client)
        llm_id, llm_cleanup_id, factory_cleanup_id = await _resolve_llm_id(
            target_model_code="qwen3.5-flash",
            target_factory_name="dashscope",
        )
        await _seed_session_with_active_agent(
            session_id=session_id,
            owner_id=owner_id,
            agent_type=agent_type,
            llm_id=llm_id,
        )
        image_b64 = _encode_fixture_image_base64()
        result = await run_agent_task(
            {
                "command": "run",
                "run_input": {
                    "threadId": str(session_id),
                    "runId": "run-live-image-1",
                    "state": {},
                    "messages": [
                        {
                            "id": "u1",
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": (
                                        "请先识别图片中的日程文字,然后调用后端日历工具创建事件。"
                                        "返回时请确保标题和开始时间不为空。"
                                    ),
                                },
                                {
                                    "type": "binary",
                                    "mimeType": "image/png",
                                    "data": image_b64,
                                },
                            ],
                        }
                    ],
                    "tools": [],
                    "context": [],
                    "forwardedProps": {},
                },
            },
            run_service=RunService(
                tool_result_storage=tool_result_storage,
                # A one-byte threshold is meant to push every tool result to storage.
                tool_result_offload_threshold_bytes=1,
                tool_result_bucket="private",
                tool_result_prefix="tool-results",
            ),
            resume_service=ResumeService(),
        )
        assert result["pending_tool_call_id"] is None
        await engine.dispose()
        async with AsyncSessionLocal() as session:
            chat_session = await session.get(AgentChatSession, session_id)
            assert chat_session is not None
            assert chat_session.status == AgentChatSessionStatus.COMPLETED
            schedule_rows = await session.execute(
                select(ScheduleItem)
                .where(ScheduleItem.owner_id == owner_id)
                .order_by(ScheduleItem.created_at.desc())
            )
            created_items = list(schedule_rows.scalars().all())
            assert created_items, (
                "Expected schedule item created by backend calendar tool"
            )
            created_item = created_items[0]
            assert created_item.title
            assert created_item.timezone
            assert created_item.start_at is not None
            tool_rows = await session.execute(
                select(AgentChatMessage)
                .where(AgentChatMessage.session_id == session_id)
                .where(AgentChatMessage.role == AgentChatMessageRole.TOOL)
                .order_by(AgentChatMessage.seq.desc())
            )
            tool_message = tool_rows.scalars().first()
            assert tool_message is not None
            metadata = tool_message.metadata_json or {}
            storage_bucket = metadata.get("storage_bucket")
            storage_path = metadata.get("storage_path")
            assert storage_bucket == "private"
            assert isinstance(storage_path, str)
            assert storage_path.startswith("tool-results/")
            uploaded_paths.append(storage_path)
            downloaded = storage.from_("private").download(uploaded_paths[0])
            if isinstance(downloaded, bytes):
                payload = json.loads(downloaded.decode("utf-8"))
            else:
                payload = json.loads(str(downloaded))
            assert payload["toolName"] == "back.create_calendar_event"
    finally:
        try:
            if uploaded_paths and storage is not None:
                try:
                    storage.from_("private").remove(uploaded_paths)
                except Exception:
                    # Deliberately best-effort: a leftover object must not
                    # hide the real test outcome.
                    pass
            # Rows can only exist once the owner was created.
            if owner_id is not None:
                try:
                    async with AsyncSessionLocal() as cleanup_session:
                        await cleanup_session.execute(
                            delete(ScheduleItem).where(ScheduleItem.owner_id == owner_id)
                        )
                        await cleanup_session.commit()
                finally:
                    await _cleanup_session_and_agent(
                        session_id=session_id,
                        agent_type=agent_type,
                        owner_id=owner_id,
                        llm_id_to_cleanup=llm_cleanup_id,
                        factory_id_to_cleanup=factory_cleanup_id,
                    )
        finally:
            # Always release external resources, even if DB cleanup raised.
            await _cleanup_auth_user(admin_client=admin_client, user_id=test_user_id)
            await supabase_service.close()
@pytest.mark.asyncio
@pytest.mark.live
async def test_agent_live_front_tool_interrupt_resume_continue() -> None:
    """Live run: a frontend tool call interrupts, is resumed, then continues.

    Flow exercised:
      1. ``run`` with a ``front.navigate_to_route`` tool must stop with a
         pending tool call and a nonce in the state snapshot.
      2. ``resume`` delivers the tool result and must enqueue exactly one
         follow-up command.
      3. Running that follow-up must leave the session ``COMPLETED`` with at
         least one tool message, and both ``RUN_STARTED`` and
         ``RUN_FINISHED`` must have been published.

    Setup after Supabase initialization runs inside ``try`` so the auth user
    and the Supabase service are released even when setup or the database
    cleanup fails.
    """
    if not _live_enabled():
        pytest.skip("Live test disabled")
    admin_client = await _init_supabase_admin_client()
    session_id = uuid.uuid4()
    agent_type = f"LIVE_E2E_{uuid.uuid4().hex[:8]}"
    queued_commands: list[dict[str, object]] = []
    published_events: list[str] = []

    async def _publish(event: dict[str, object]) -> None:
        # Record only the event type; the payload is not asserted on.
        event_type = event.get("type")
        if isinstance(event_type, str):
            published_events.append(event_type)

    async def _enqueue(command: dict[str, object]) -> str:
        # Capture follow-up commands instead of sending them to a queue.
        queued_commands.append(command)
        return "task-followup-live"

    # Pre-declare everything the cleanup reads so ``finally`` is safe even
    # when a setup step raises part-way through.
    owner_id: uuid.UUID | None = None
    test_user_id: str | None = None
    llm_cleanup_id: uuid.UUID | None = None
    factory_cleanup_id: uuid.UUID | None = None
    try:
        owner_id, test_user_id = await _create_owner_profile(admin_client)
        llm_id, llm_cleanup_id, factory_cleanup_id = await _resolve_llm_id()
        await _seed_session_with_active_agent(
            session_id=session_id,
            owner_id=owner_id,
            agent_type=agent_type,
            llm_id=llm_id,
        )
        run_result = await run_agent_task(
            {
                "command": "run",
                "run_input": {
                    "threadId": str(session_id),
                    "runId": "run-live-front-1",
                    "state": {},
                    "messages": [
                        {
                            "id": "u1",
                            "role": "user",
                            "content": "你必须调用 front.navigate_to_route 工具跳转到 /calendar/dayweek。",
                        }
                    ],
                    "tools": [
                        {
                            "name": "front.navigate_to_route",
                            "description": "Navigate frontend route; runtime raises approval interrupt when called.",
                            "parameters": {
                                "type": "object",
                                "properties": {
                                    "target": {"type": "string"},
                                    "replace": {"type": "boolean"},
                                },
                                "required": ["target"],
                            },
                        }
                    ],
                    "context": [],
                    "forwardedProps": {},
                },
            },
            publish_event=_publish,
            enqueue_command=_enqueue,
            run_service=RunService(),
            resume_service=ResumeService(),
        )
        pending_tool_call_id = run_result["pending_tool_call_id"]
        assert isinstance(pending_tool_call_id, str), (
            f"Expected pending tool call, got result: {json.dumps(run_result, ensure_ascii=False)}"
        )
        snapshot = run_result["state_snapshot"]
        assert isinstance(snapshot, dict)
        pending_tool_nonce = snapshot.get("pending_tool_nonce")
        assert isinstance(pending_tool_nonce, str)
        # Recover the tool arguments from the first TOOL_CALL_ARGS event for
        # the pending call whose delta parses to a JSON object.
        guarded_tool_args: dict[str, object] | None = None
        has_matching_tool_args_event = False
        events = run_result.get("events")
        if isinstance(events, list):
            for event in events:
                if not isinstance(event, dict):
                    continue
                if event.get("type") != "TOOL_CALL_ARGS":
                    continue
                if event.get("toolCallId") != pending_tool_call_id:
                    continue
                has_matching_tool_args_event = True
                delta = event.get("delta")
                if not isinstance(delta, str):
                    continue
                try:
                    parsed_delta = json.loads(delta)
                except (TypeError, ValueError):
                    continue
                if isinstance(parsed_delta, dict):
                    guarded_tool_args = parsed_delta
                    break
        if has_matching_tool_args_event:
            assert guarded_tool_args is not None
        if guarded_tool_args is None:
            # No args event was reported: fall back to the arguments the
            # prompt asked for, tagged with the nonce from the snapshot.
            guarded_tool_args = {
                "target": "/calendar/dayweek",
                "replace": False,
                "__nonce": pending_tool_nonce,
            }
        assert guarded_tool_args.get("__nonce") == pending_tool_nonce
        await run_agent_task(
            {
                "command": "resume",
                "run_input": {
                    "threadId": str(session_id),
                    "runId": "run-live-front-2",
                    "state": {},
                    "messages": [
                        {
                            "id": "tool-1",
                            "role": "tool",
                            "toolCallId": pending_tool_call_id,
                            "content": json.dumps(
                                {
                                    "toolName": "front.navigate_to_route",
                                    "toolArgs": guarded_tool_args,
                                    "nonce": pending_tool_nonce,
                                    "result": {
                                        "ok": True,
                                        "route": "/calendar/dayweek",
                                    },
                                },
                                ensure_ascii=True,
                                separators=(",", ":"),
                            ),
                        }
                    ],
                    "tools": [],
                    "context": [],
                    "forwardedProps": {},
                },
            },
            publish_event=_publish,
            enqueue_command=_enqueue,
            run_service=RunService(),
            resume_service=ResumeService(),
        )
        assert len(queued_commands) == 1
        await run_agent_task(
            queued_commands[0],
            publish_event=_publish,
            enqueue_command=_enqueue,
            run_service=RunService(),
            resume_service=ResumeService(),
        )
        await engine.dispose()
        async with AsyncSessionLocal() as session:
            chat_session = await session.get(AgentChatSession, session_id)
            assert chat_session is not None
            assert chat_session.status == AgentChatSessionStatus.COMPLETED
            rows = await session.execute(
                select(AgentChatMessage)
                .where(AgentChatMessage.session_id == session_id)
                .order_by(AgentChatMessage.seq.asc())
            )
            messages = list(rows.scalars().all())
            assert any(m.role == AgentChatMessageRole.TOOL for m in messages)
            assert chat_session.total_cost >= Decimal("0")
        assert "RUN_STARTED" in published_events
        assert "RUN_FINISHED" in published_events
    finally:
        try:
            # Rows can only exist once the owner was created.
            if owner_id is not None:
                await _cleanup_session_and_agent(
                    session_id=session_id,
                    agent_type=agent_type,
                    owner_id=owner_id,
                    llm_id_to_cleanup=llm_cleanup_id,
                    factory_id_to_cleanup=factory_cleanup_id,
                )
        finally:
            # Always release external resources, even if DB cleanup raised.
            await _cleanup_auth_user(admin_client=admin_client, user_id=test_user_id)
            await supabase_service.close()
Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

@@ -0,0 +1,37 @@
from __future__ import annotations
from core.agent.domain.agui_input import extract_latest_user_payload, parse_run_input
def test_parse_run_input_accepts_binary_multimodal_content() -> None:
    """A ``binary`` image part is parsed and exposed as an ``image_url`` block.

    The text part must come back as the user text, and the last extracted
    block must be a base64 data URL built from the MIME type and payload.
    """
    user_message = {
        "id": "u1",
        "role": "user",
        "content": [
            {"type": "text", "text": "extract image"},
            {
                "type": "binary",
                "mimeType": "image/png",
                "data": "ZmFrZS1iYXNlNjQ=",
            },
        ],
    }
    raw_input = {
        "threadId": "00000000-0000-0000-0000-000000000001",
        "runId": "run-1",
        "state": {},
        "messages": [user_message],
        "tools": [],
        "context": [],
        "forwardedProps": {},
    }
    run_input = parse_run_input(raw_input)
    user_text, blocks = extract_latest_user_payload(run_input)
    expected_image_block = {
        "type": "image_url",
        "image_url": {"url": "data:image/png;base64,ZmFrZS1iYXNlNjQ="},
    }
    assert user_text == "extract image"
    assert blocks[-1] == expected_image_block
@@ -1,7 +1,5 @@
from __future__ import annotations
from pathlib import Path
import pytest
from core.agent.infrastructure.crewai.loader import (
@@ -35,31 +33,3 @@ def test_load_agent_task_template_returns_matching_pair() -> None:
def test_load_agent_task_template_rejects_unknown_stage() -> None:
    """Requesting a stage named ``"unknown"`` must raise ``ValueError``."""
    expected_message = "Unknown CrewAI stage"
    with pytest.raises(ValueError, match=expected_message):
        load_agent_task_template(stage="unknown")
def test_load_crewai_agent_templates_rejects_invalid_yaml_shape() -> None:
    """A template file whose top level is a YAML list must be rejected.

    The malformed file is written to a temporary directory rather than into
    the project's ``static/crewai`` folder, so the test never touches the
    source tree and cannot collide with a parallel run.
    """
    import tempfile

    # NOTE(review): assumes the loader only reads the path it is given and
    # does not require the file to live under the static config directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / "agents.invalid-shape.yaml"
        path.write_text("- invalid\n", encoding="utf-8")
        with pytest.raises(ValueError, match="Invalid CrewAI template format"):
            load_crewai_agent_templates(path)
def test_load_crewai_agent_templates_rejects_missing_required_fields() -> None:
    """A template that defines only ``role`` for an agent must be rejected.

    The incomplete file is written to a temporary directory rather than into
    the project's ``static/crewai`` folder, so the test never touches the
    source tree and cannot collide with a parallel run.
    """
    import tempfile

    # NOTE(review): assumes the loader only reads the path it is given and
    # does not require the file to live under the static config directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / "agents.invalid.yaml"
        path.write_text("intent:\n role: Intent Agent\n", encoding="utf-8")
        with pytest.raises(ValueError, match="Invalid CrewAI agent template"):
            load_crewai_agent_templates(path)
@@ -3,8 +3,10 @@ from __future__ import annotations
from types import MethodType, SimpleNamespace
from typing import cast
import core.agent.infrastructure.crewai.runtime as runtime_module
import core.agent.infrastructure.crewai.runtime_stage_runner as stage_runner_module
from core.agent.infrastructure.config.resolver import AgentConfigResolver, SettingsLike
from core.agent.infrastructure.crewai.runtime import CrewAIRuntime
from core.agent.infrastructure.crewai.runtime import CrewAIRuntime, _parse_intent_result
from core.agent.infrastructure.litellm.usage_tracker import UsageCost
@@ -127,6 +129,298 @@ def test_runtime_needs_execution_and_collects_front_tool_call() -> None:
assert result["total_tokens"] == 6
def test_runtime_extracts_pending_front_tool_from_execution_data() -> None:
    """A frontend tool described in ``execution_data`` becomes ``pending_front_tool``.

    The execution stage reports ``tool_name`` plus ``arguments``; the runtime
    result must expose them as a frontend-targeted pending call.
    """
    # Canned (raw output, UsageCost args) per stage; every other stage gets
    # the organization reply.
    canned_by_stage = {
        "intent": (
            '{"route":"NEEDS_EXECUTION","intent_summary":"navigate","execution_brief":"call tool","safety_flags":[]}',
            (1, 1, 2, 0.01),
        ),
        "execution": (
            '{"status":"SUCCESS","execution_summary":"done","execution_data":{"tool_name":"front.navigate_to_route","arguments":{"target":"/calendar/dayweek","replace":false},"result_status":"pending_approval"},"report_brief":"awaiting approval"}',
            (2, 2, 4, 0.02),
        ),
    }
    organization_reply = (
        '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}',
        (3, 3, 6, 0.03),
    )

    def _fake_run_stage(self, **kwargs):
        raw_text, usage_args = canned_by_stage.get(kwargs["stage"], organization_reply)
        return raw_text, UsageCost(*usage_args), [], None

    runtime = _build_runtime()
    runtime._run_stage_with_crewai = MethodType(_fake_run_stage, runtime)  # type: ignore[method-assign]
    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "navigate",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
            "required": ["target"],
        },
    }
    result = runtime.execute(user_input="go", tools=[navigate_tool])
    expected_pending = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }
    assert result["pending_front_tool"] == expected_pending
def test_runtime_multimodal_intent_receives_execution_tool_awareness() -> None:
    """With multimodal input, the intent stage is told about backend tools.

    The first recorded stage call must carry a tool named
    ``back.create_calendar_event`` even though the caller passed no tools.
    """
    stage_calls: list[dict[str, object]] = []
    # Canned (raw output, UsageCost args) per stage; every other stage gets
    # the organization reply.
    canned_by_stage = {
        "intent": (
            '{"route":"NEEDS_EXECUTION","intent_summary":"need tool","execution_brief":"call back.create_calendar_event","safety_flags":[]}',
            (1, 1, 2, 0.01),
        ),
        "execution": (
            '{"status":"SUCCESS","execution_summary":"done","execution_data":{},"report_brief":"ok"}',
            (2, 2, 4, 0.02),
        ),
    }
    organization_reply = (
        '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}',
        (3, 3, 6, 0.03),
    )

    def _fake_run_stage(self, **kwargs):
        stage = kwargs["stage"]
        stage_calls.append({"stage": stage, "tools": kwargs["tools_payload"]})
        raw_text, usage_args = canned_by_stage.get(stage, organization_reply)
        return raw_text, UsageCost(*usage_args), [], None

    runtime = _build_runtime()
    runtime._run_stage_with_crewai = MethodType(_fake_run_stage, runtime)  # type: ignore[method-assign]
    runtime.execute(
        user_input="go",
        user_input_multimodal=[{"type": "text", "text": "hello"}],
        tools=[],
    )
    intent_tools = cast(list[dict[str, object]], stage_calls[0]["tools"])
    offered_names = [tool.get("name") for tool in intent_tools]
    assert "back.create_calendar_event" in offered_names
def test_runtime_synthesizes_backend_call_when_model_skips_react_tool_call() -> None:
    """The runtime calls the backend tool itself when the model did not.

    The execution stage reports event fields in ``execution_data`` without
    any tool call; the backend handler must still receive exactly one
    ``back.create_calendar_event`` call, and the result must list it.
    """
    runtime = _build_runtime()
    handler_calls: list[tuple[str, dict[str, object]]] = []

    def _backend_handler(
        tool_name: str, tool_args: dict[str, object]
    ) -> dict[str, object]:
        handler_calls.append((tool_name, tool_args))
        card_data = {"id": "evt-1", "title": str(tool_args.get("title", ""))}
        return {
            "type": "calendar_card.v1",
            "version": "v1",
            "data": card_data,
            "actions": [],
        }

    runtime.set_backend_tool_handler(_backend_handler)
    # Canned (raw output, UsageCost args) per stage; every other stage gets
    # the organization reply.
    canned_by_stage = {
        "intent": (
            '{"route":"NEEDS_EXECUTION","intent_summary":"create event","execution_brief":"create via backend tool","safety_flags":[]}',
            (1, 1, 2, 0.01),
        ),
        "execution": (
            '{"status":"SUCCESS","execution_summary":"created","execution_data":{"title":"项目评审","timezone":"Asia/Shanghai"},"report_brief":"done"}',
            (2, 2, 4, 0.02),
        ),
    }
    organization_reply = (
        '{"assistant_text":"ok","response_metadata":{}}',
        (1, 1, 2, 0.01),
    )

    def _fake_run_stage(self, **kwargs):
        raw_text, usage_args = canned_by_stage.get(kwargs["stage"], organization_reply)
        return raw_text, UsageCost(*usage_args), [], None

    runtime._run_stage_with_crewai = MethodType(_fake_run_stage, runtime)  # type: ignore[method-assign]
    result = runtime.execute(user_input="创建日程", tools=[])
    expected_call = (
        "back.create_calendar_event",
        {"title": "项目评审", "timezone": "Asia/Shanghai"},
    )
    assert handler_calls == [expected_call]
    tool_calls = cast(list[dict[str, object]], result["tool_calls"])
    backend_hits = [
        call
        for call in tool_calls
        if call.get("target") == "backend"
        and call.get("name") == "back.create_calendar_event"
    ]
    assert backend_hits
def test_runtime_extracts_pending_front_tool_from_approval_required_shape() -> None:
    """A flat ``approval_required`` payload still yields ``pending_front_tool``.

    Here ``execution_data`` carries ``target`` beside ``tool_name`` instead of
    nested arguments; the expected pending call includes ``replace: False``
    although the payload never mentions it.
    """
    # Canned (raw output, UsageCost args) per stage; every other stage gets
    # the organization reply.
    canned_by_stage = {
        "intent": (
            '{"route":"NEEDS_EXECUTION","intent_summary":"navigate","execution_brief":"call tool","safety_flags":[]}',
            (1, 1, 2, 0.01),
        ),
        "execution": (
            '{"status":"PARTIAL","execution_summary":"approval needed","execution_data":{"tool_name":"front.navigate_to_route","target":"/calendar/dayweek","approval_required":true},"report_brief":"await approval"}',
            (2, 2, 4, 0.02),
        ),
    }
    organization_reply = (
        '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}',
        (3, 3, 6, 0.03),
    )

    def _fake_run_stage(self, **kwargs):
        raw_text, usage_args = canned_by_stage.get(kwargs["stage"], organization_reply)
        return raw_text, UsageCost(*usage_args), [], None

    runtime = _build_runtime()
    runtime._run_stage_with_crewai = MethodType(_fake_run_stage, runtime)  # type: ignore[method-assign]
    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "navigate",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
            "required": ["target"],
        },
    }
    result = runtime.execute(user_input="go", tools=[navigate_tool])
    expected_pending = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }
    assert result["pending_front_tool"] == expected_pending
def test_runtime_resume_from_execution_stage_keeps_valid_intent_payload() -> None:
    """Resuming at the execution stage still produces a final answer.

    Only execution and the later stage are stubbed; the run is started with
    ``resume_from_stage="execution"``.
    """
    execution_reply = (
        '{"status":"SUCCESS","execution_summary":"done","execution_data":{},"report_brief":"ok"}',
        (2, 2, 4, 0.02),
    )
    organization_reply = (
        '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}',
        (3, 3, 6, 0.03),
    )

    def _fake_run_stage(self, **kwargs):
        is_execution = kwargs["stage"] == "execution"
        raw_text, usage_args = execution_reply if is_execution else organization_reply
        return raw_text, UsageCost(*usage_args), [], None

    runtime = _build_runtime()
    runtime._run_stage_with_crewai = MethodType(_fake_run_stage, runtime)  # type: ignore[method-assign]
    result = runtime.execute(
        user_input="resume",
        tools=[],
        resume_from_stage="execution",
    )
    # NOTE(review): the stubbed organization reply says "final answer" while
    # "ok" only appears as the execution report_brief -- confirm where the
    # runtime takes assistant_text from on this path.
    assert result["assistant_text"] == "ok"
def test_run_stage_with_crewai_uses_output_pydantic_for_stage(
    monkeypatch,
) -> None:
    """The intent stage passes ``IntentResult`` as the task's ``output_pydantic``.

    The CrewAI classes are replaced by recorders. The returned text must
    round-trip through ``IntentResult``, and the usage totals must come from
    the fake ``token_usage``.
    """
    runtime = _build_runtime()
    recorded: dict[str, object] = {}
    kickoff_result = SimpleNamespace(
        raw="ignored",
        pydantic=runtime_module.IntentResult(
            route="DIRECT_EXECUTION",
            intent_summary="intent",
            assistant_text="ok",
            safety_flags=[],
        ),
        json_dict=None,
        token_usage=SimpleNamespace(
            prompt_tokens=1,
            completion_tokens=2,
            total_tokens=3,
        ),
    )

    class _FakeLLM:
        def __init__(self, **kwargs):
            recorded["llm_kwargs"] = kwargs

    class _FakeAgent:
        def __init__(self, **kwargs):
            recorded["agent_kwargs"] = kwargs
            self.llm = kwargs.get("llm")

    class _FakeTask:
        def __init__(self, **kwargs):
            recorded["task_kwargs"] = kwargs

    class _FakeCrew:
        def __init__(self, **kwargs):
            recorded["crew_kwargs"] = kwargs

        def kickoff(self):
            return kickoff_result

    replacements = (
        ("LLM", _FakeLLM),
        ("Agent", _FakeAgent),
        ("Task", _FakeTask),
        ("Crew", _FakeCrew),
    )
    for attribute, fake in replacements:
        monkeypatch.setattr(stage_runner_module, attribute, fake)
    text, usage, calls, pending = runtime._run_stage_with_crewai(
        stage="intent",
        user_content="hello",
        system_prompt="",
        tools_payload=[],
        litellm_model="dashscope/qwen3.5-flash",
    )
    task_kwargs = cast(dict[str, object], recorded["task_kwargs"])
    assert task_kwargs.get("output_pydantic") is runtime_module.IntentResult
    parsed = runtime_module.IntentResult.model_validate_json(text)
    assert parsed.assistant_text == "ok"
    assert usage.total_tokens == 3
    assert calls == []
    assert pending is None
def test_runtime_backend_registry_check() -> None:
    """``back.create_calendar_event`` is reported as a registered backend tool."""
    is_registered = _build_runtime().is_registered_backend_tool(
        "back.create_calendar_event"
    )
    assert is_registered is True
@@ -179,3 +473,184 @@ def test_runtime_emits_step_started_finished_for_all_three_stages() -> None:
"organization",
"organization",
]
def test_parse_intent_result_accepts_markdown_json_fence() -> None:
    """Intent JSON wrapped in a Markdown ``json`` code fence is still parsed."""
    fenced_output = """```json
{
\"route\": \"DIRECT_EXECUTION\",
\"intent_summary\": \"navigate\",
\"assistant_text\": \"ok\",
\"safety_flags\": []
}
```"""
    parsed = _parse_intent_result(fenced_output)
    assert (parsed.route, parsed.assistant_text) == ("DIRECT_EXECUTION", "ok")
def test_parse_intent_result_coerces_structured_fields() -> None:
    """Object-valued ``execution_brief`` and ``safety_flags`` are coerced.

    Expected after parsing: the route becomes ``NEEDS_EXECUTION``, the brief
    is a string mentioning the tool name, and only the flag set to ``true``
    is kept in the flag list.
    """
    model_output = """{
"route": "DIRECT_EXECUTION",
"intent_summary": "navigate",
"assistant_text": "",
"execution_brief": {
"action": "front.navigate_to_route",
"target": "/calendar/dayweek"
},
"safety_flags": {
"security_concern": false,
"requires_confirmation": true
}
}"""
    parsed = _parse_intent_result(model_output)
    assert parsed.route == "NEEDS_EXECUTION"
    brief = parsed.execution_brief
    assert brief is not None
    assert "front.navigate_to_route" in brief
    assert parsed.safety_flags == ["requires_confirmation"]
def test_parse_intent_result_coerces_structured_intent_summary() -> None:
    """An object-valued ``intent_summary`` is turned into a JSON-like string.

    The resulting summary must start with ``{`` and still contain the
    original ``intent_type`` text.
    """
    model_output = """{
"route": "NEEDS_EXECUTION",
"intent_summary": {
"intent_type": "Navigation Request",
"confidence": 0.93
},
"execution_brief": "call front tool",
"safety_flags": []
}"""
    parsed = _parse_intent_result(model_output)
    summary = parsed.intent_summary
    assert parsed.route == "NEEDS_EXECUTION"
    assert summary.startswith("{")
    assert "Navigation Request" in summary
def test_runtime_uses_prompt_module_for_stage_descriptions(monkeypatch) -> None:
    """The task description comes from ``build_stage_task_description``.

    The prompt builder is replaced by a stub returning a sentinel; the fake
    ``Task`` must receive exactly that sentinel as ``description``.
    """
    runtime = _build_runtime()
    observed: dict[str, object] = {"called": False}
    kickoff_result = SimpleNamespace(
        raw="ignored",
        pydantic=runtime_module.IntentResult(
            route="DIRECT_EXECUTION",
            intent_summary="intent",
            assistant_text="ok",
            safety_flags=[],
        ),
        json_dict=None,
        token_usage=SimpleNamespace(
            prompt_tokens=1,
            completion_tokens=2,
            total_tokens=3,
        ),
    )

    class _FakeLLM:
        def __init__(self, **kwargs):
            del kwargs

    class _FakeAgent:
        def __init__(self, **kwargs):
            self.llm = kwargs.get("llm")

    class _FakeTask:
        def __init__(self, **kwargs):
            observed["description"] = kwargs.get("description")

    class _FakeCrew:
        def __init__(self, **kwargs):
            del kwargs

        def kickoff(self):
            return kickoff_result

    def _fake_build_stage_task_description(**kwargs):
        del kwargs
        observed["called"] = True
        return "PROMPT_FROM_MODULE"

    replacements = (
        ("LLM", _FakeLLM),
        ("Agent", _FakeAgent),
        ("Task", _FakeTask),
        ("Crew", _FakeCrew),
    )
    for attribute, fake in replacements:
        monkeypatch.setattr(stage_runner_module, attribute, fake)
    monkeypatch.setattr(
        stage_runner_module.runtime_stage_prompts,
        "build_stage_task_description",
        _fake_build_stage_task_description,
    )
    runtime._run_stage_with_crewai(
        stage="intent",
        user_content="hello",
        system_prompt="",
        tools_payload=[],
        litellm_model="dashscope/qwen3.5-flash",
    )
    assert observed["called"] is True
    assert observed["description"] == "PROMPT_FROM_MODULE"
def test_run_stage_with_crewai_does_not_force_execution_output_pydantic(
    monkeypatch,
) -> None:
    """The execution stage builds its task without ``output_pydantic``.

    The fake ``Task`` records the ``output_pydantic`` keyword it was given;
    for ``stage="execution"`` it must be ``None``.
    """
    runtime = _build_runtime()
    observed: dict[str, object] = {}
    kickoff_result = SimpleNamespace(
        raw='{"status":"SUCCESS","execution_summary":"done","execution_data":{},"report_brief":"ok"}',
        pydantic=None,
        json_dict=None,
        token_usage=SimpleNamespace(
            prompt_tokens=1,
            completion_tokens=2,
            total_tokens=3,
        ),
    )

    class _FakeLLM:
        def __init__(self, **kwargs):
            del kwargs

    class _FakeAgent:
        def __init__(self, **kwargs):
            self.llm = kwargs.get("llm")

    class _FakeTask:
        def __init__(self, **kwargs):
            observed["output_pydantic"] = kwargs.get("output_pydantic")

    class _FakeCrew:
        def __init__(self, **kwargs):
            del kwargs

        def kickoff(self):
            return kickoff_result

    replacements = (
        ("LLM", _FakeLLM),
        ("Agent", _FakeAgent),
        ("Task", _FakeTask),
        ("Crew", _FakeCrew),
    )
    for attribute, fake in replacements:
        monkeypatch.setattr(stage_runner_module, attribute, fake)
    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "navigate",
        "parameters": {
            "type": "object",
            "properties": {"target": {"type": "string"}},
            "required": ["target"],
        },
    }
    runtime._run_stage_with_crewai(
        stage="execution",
        user_content='{"user_input":"go","intent_summary":"navigate"}',
        system_prompt="",
        tools_payload=[navigate_tool],
        litellm_model="dashscope/qwen3.5-flash",
    )
    assert observed["output_pydantic"] is None
@@ -0,0 +1,19 @@
from __future__ import annotations
from core.agent.infrastructure.crewai.runtime_parsers import parse_execution_result
def test_parse_execution_result_preserves_execution_data_for_interrupted_status() -> (
    None
):
    """An ``interrupted`` status maps to ``PARTIAL`` and keeps ``execution_data``."""
    raw_output = (
        '{"status":"interrupted","execution_summary":"approval needed",'
        '"execution_data":{"tool_called":"front.navigate_to_route",'
        '"input":{"target":"/calendar/dayweek"},'
        '"error":"frontend tool requires approval"},'
        '"report_brief":"await approval"}'
    )
    parsed = parse_execution_result(raw_output)
    execution_data = parsed.execution_data
    assert parsed.status == "PARTIAL"
    assert execution_data.get("tool_called") == "front.navigate_to_route"
    assert execution_data.get("input") == {"target": "/calendar/dayweek"}
@@ -0,0 +1,223 @@
from __future__ import annotations
import pytest
from crewai.agents import parser as crew_parser
from core.agent.infrastructure.crewai.runtime_tools import (
PendingFrontendToolCall,
extract_pending_front_tool,
resolve_stage_crewai_tools,
)
def test_frontend_tool_accepts_direct_kwargs_and_raises_pending() -> None:
    """Running a frontend tool raises ``PendingFrontendToolCall`` with its call.

    The exception payload must carry the tool name and the keyword arguments
    that were passed to ``run``.
    """
    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "Navigate to route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
            "required": ["target"],
        },
    }
    recorded_calls: list[dict[str, object]] = []
    tools = resolve_stage_crewai_tools(
        tools_payload=[navigate_tool],
        calls=recorded_calls,
        backend_handler=None,
    )
    with pytest.raises(PendingFrontendToolCall) as raised:
        tools[0].run(target="/calendar/dayweek", replace=False)
    payload = raised.value.payload
    assert payload["name"] == "front.navigate_to_route"
    assert payload["args"] == {
        "target": "/calendar/dayweek",
        "replace": False,
    }
def test_react_action_text_can_address_frontend_tool_name() -> None:
    """A ReAct ``Action:`` line with a dotted tool name resolves to the tool.

    The CrewAI parser must return an ``AgentAction``, and the tool it names
    must exist among the resolved tools and raise the pending-call exception
    when run.
    """
    react_text = (
        "Thought: need route change\n"
        "Action: front.navigate_to_route\n"
        'Action Input: {"target":"/calendar/dayweek","replace":false}'
    )
    parsed = crew_parser.parse(react_text)
    assert isinstance(parsed, crew_parser.AgentAction)
    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "Navigate to route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
            "required": ["target"],
        },
    }
    recorded_calls: list[dict[str, object]] = []
    tools = resolve_stage_crewai_tools(
        tools_payload=[navigate_tool],
        calls=recorded_calls,
        backend_handler=None,
    )
    matching_tool = next(item for item in tools if item.name == parsed.tool)
    with pytest.raises(PendingFrontendToolCall) as raised:
        matching_tool.run(target="/calendar/dayweek", replace=False)
    assert raised.value.payload["name"] == "front.navigate_to_route"
def test_dynamic_tool_args_schema_follows_tool_parameters() -> None:
    """Check the generated ``args_schema`` against the declared parameters.

    The JSON schema produced by ``args_schema.model_json_schema()`` must list
    both ``target`` and ``replace`` as properties and mark only ``target`` as
    required.
    """
    navigate_spec = {
        "name": "front.navigate_to_route",
        "description": "Navigate to route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
            "required": ["target"],
        },
    }
    recorded_calls: list[dict[str, object]] = []
    stage_tools = resolve_stage_crewai_tools(
        tools_payload=[navigate_spec],
        calls=recorded_calls,
        backend_handler=None,
    )

    json_schema = stage_tools[0].args_schema.model_json_schema()
    declared_properties = json_schema.get("properties", {})
    required_fields = json_schema.get("required", [])

    assert isinstance(declared_properties, dict)
    for field_name in ("target", "replace"):
        assert field_name in declared_properties
    assert required_fields == ["target"]
def test_extract_pending_front_tool_supports_tool_called_and_input_fields() -> None:
    """Extract a pending call from ``tool_called`` / ``input`` execution data.

    The execution data only supplies ``target``; the extracted args are
    expected to also contain ``replace`` set to ``False``.
    """
    navigate_spec = {
        "name": "front.navigate_to_route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
        },
    }
    reported = {
        "tool_called": "front.navigate_to_route",
        "input": {"target": "/calendar/dayweek"},
        "status": "pending_approval",
    }
    expected = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }

    extracted = extract_pending_front_tool(
        execution_tools=[navigate_spec],
        pending_call=None,
        execution_data=reported,
    )

    assert extracted == expected
def test_extract_pending_front_tool_supports_interrupted_status_with_error() -> None:
    """Extract a pending call from data with ``interrupted`` status and error.

    The execution data carries ``status``, ``tool_called``, ``parameters`` and
    an ``error`` message; the extracted call must name the frontend tool with
    the given parameters as its args.
    """
    navigate_spec = {
        "name": "front.navigate_to_route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
        },
    }
    reported = {
        "status": "interrupted",
        "tool_called": "front.navigate_to_route",
        "parameters": {"target": "/calendar/dayweek", "replace": False},
        "error": "frontend tool requires approval",
    }
    expected = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }

    extracted = extract_pending_front_tool(
        execution_tools=[navigate_spec],
        pending_call=None,
        execution_data=reported,
    )

    assert extracted == expected
def test_extract_pending_front_tool_supports_approval_result_field() -> None:
    """Extract a pending call when ``result`` is ``approval_required_error``.

    The execution data has no ``status`` key, only ``tool_called``,
    ``parameters`` and ``result``; the frontend call must still be extracted.
    """
    navigate_spec = {
        "name": "front.navigate_to_route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
        },
    }
    reported = {
        "tool_called": "front.navigate_to_route",
        "parameters": {"target": "/calendar/dayweek", "replace": False},
        "result": "approval_required_error",
    }
    expected = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }

    extracted = extract_pending_front_tool(
        execution_tools=[navigate_spec],
        pending_call=None,
        execution_data=reported,
    )

    assert extracted == expected
def test_extract_pending_front_tool_supports_observation_field() -> None:
    """Extract a pending call when only an ``observation`` text is present.

    The execution data carries ``tool_called``, ``parameters`` and an
    ``observation`` string; the frontend call must still be extracted.
    """
    navigate_spec = {
        "name": "front.navigate_to_route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
        },
    }
    reported = {
        "tool_called": "front.navigate_to_route",
        "parameters": {"target": "/calendar/dayweek", "replace": False},
        "observation": "frontend tool requires approval.",
    }
    expected = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }

    extracted = extract_pending_front_tool(
        execution_tools=[navigate_spec],
        pending_call=None,
        execution_data=reported,
    )

    assert extracted == expected
@@ -0,0 +1,16 @@
from __future__ import annotations
from core.agent.prompt.runtime_stage_prompts import build_stage_task_description
def test_execution_stage_prompt_includes_react_tool_invocation_rule() -> None:
    """The execution-stage task description must mention the ReAct markers.

    Builds the description for a single frontend tool and checks that both
    ``Action:`` and ``Action Input:`` appear in the resulting text.
    """
    rendered = build_stage_task_description(
        stage="execution",
        task_description="execute",
        tools_payload=[{"name": "front.navigate_to_route"}],
        system_prompt="",
        user_content="go",
    )

    for marker in ("Action:", "Action Input:"):
        assert marker in rendered
@@ -0,0 +1,26 @@
from __future__ import annotations
import pytest
import core.agent.infrastructure.crewai.tools.stage_tool_allowlist as allowlist_module
def test_load_crewai_stage_tools_returns_expected_defaults() -> None:
    """The default allowlist exposes one backend tool, in ``execution`` only."""
    expected_defaults = {
        "intent": [],
        "execution": ["back.create_calendar_event"],
        "organization": [],
    }

    loaded = allowlist_module.load_crewai_stage_tools()

    assert loaded == expected_defaults
def test_load_crewai_stage_tools_rejects_unknown_backend_tool(monkeypatch) -> None:
    """An allowlist naming ``back.unknown`` must be rejected on load.

    ``STAGE_TOOL_ALLOWLIST`` is patched for the duration of the test and
    ``load_crewai_stage_tools`` is expected to raise ``ValueError`` whose
    message matches ``unknown backend tool``.
    """
    bogus_allowlist = {"execution": ["back.unknown"]}
    monkeypatch.setattr(allowlist_module, "STAGE_TOOL_ALLOWLIST", bogus_allowlist)

    with pytest.raises(ValueError, match="unknown backend tool"):
        allowlist_module.load_crewai_stage_tools()
@@ -1,5 +1,7 @@
from __future__ import annotations
import asyncio
import pytest
from core.config.settings import RedisSettings
@@ -107,7 +109,9 @@ async def test_get_or_init_redis_client_initializes_when_needed(
async def _fake_initialize() -> bool:
return True
monkeypatch.setattr(type(redis_service), "is_initialized", property(lambda _: False))
monkeypatch.setattr(
type(redis_service), "is_initialized", property(lambda _: False)
)
monkeypatch.setattr(redis_service, "initialize", _fake_initialize)
monkeypatch.setattr(redis_service, "get_client", lambda: fake_client)
@@ -123,8 +127,40 @@ async def test_get_or_init_redis_client_raises_when_init_fails(
async def _fake_initialize() -> bool:
return False
monkeypatch.setattr(type(redis_service), "is_initialized", property(lambda _: False))
monkeypatch.setattr(
type(redis_service), "is_initialized", property(lambda _: False)
)
monkeypatch.setattr(redis_service, "initialize", _fake_initialize)
with pytest.raises(RuntimeError, match="Redis service initialization failed"):
await get_or_init_redis_client()
@pytest.mark.asyncio
async def test_get_or_init_redis_client_reinitializes_when_event_loop_changes(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A service bound to another loop must be initialized again.

    The service is patched to look initialized with a stale client and a
    recorded ``_loop_id`` of 123, while ``asyncio.get_running_loop`` is patched
    to return a stand-in object. The helper is then expected to call
    ``initialize`` exactly once and hand back the fresh client.
    """
    stale_client = _FakeRedisClient()
    fresh_client = _FakeRedisClient()
    init_calls = 0

    async def _fake_initialize() -> bool:
        # Count invocations so the test can prove re-initialization happened.
        nonlocal init_calls
        init_calls += 1
        return True

    class _Loop:
        """Stand-in for an event loop object."""

    stand_in_loop = _Loop()

    monkeypatch.setattr(asyncio, "get_running_loop", lambda: stand_in_loop)
    monkeypatch.setattr(redis_service, "initialize", _fake_initialize)
    monkeypatch.setattr(redis_service, "get_client", lambda: fresh_client)
    # raising=False: these private attributes may not exist before patching.
    monkeypatch.setattr(redis_service, "_client", stale_client, raising=False)
    monkeypatch.setattr(redis_service, "_loop_id", 123, raising=False)
    monkeypatch.setattr(redis_service, "_initialized", True, raising=False)

    returned_client = await get_or_init_redis_client()

    assert init_calls == 1
    assert returned_client is fresh_client