fix(agent): stabilize live e2e tool execution and loop isolation

This commit is contained in:
zl-q
2026-03-08 22:41:59 +08:00
parent 14508c52f6
commit 2980213a5b
32 changed files with 3076 additions and 560 deletions
@@ -0,0 +1,37 @@
from __future__ import annotations
from core.agent.domain.agui_input import extract_latest_user_payload, parse_run_input
def test_parse_run_input_accepts_binary_multimodal_content() -> None:
    """Check that a binary image part is surfaced as an ``image_url`` block.

    The run input carries one user message with a text part and a binary PNG
    part; the extracted text must be the text part and the last extracted
    block must be a base64 data-URL image block.
    """
    binary_part = {
        "type": "binary",
        "mimeType": "image/png",
        "data": "ZmFrZS1iYXNlNjQ=",
    }
    user_message = {
        "id": "u1",
        "role": "user",
        "content": [{"type": "text", "text": "extract image"}, binary_part],
    }
    raw_payload = {
        "threadId": "00000000-0000-0000-0000-000000000001",
        "runId": "run-1",
        "state": {},
        "messages": [user_message],
        "tools": [],
        "context": [],
        "forwardedProps": {},
    }

    parsed_input = parse_run_input(raw_payload)
    latest_text, content_blocks = extract_latest_user_payload(parsed_input)

    expected_image_block = {
        "type": "image_url",
        "image_url": {"url": "data:image/png;base64,ZmFrZS1iYXNlNjQ="},
    }
    assert latest_text == "extract image"
    assert content_blocks[-1] == expected_image_block
@@ -1,7 +1,5 @@
from __future__ import annotations
from pathlib import Path
import pytest
from core.agent.infrastructure.crewai.loader import (
@@ -35,31 +33,3 @@ def test_load_agent_task_template_returns_matching_pair() -> None:
def test_load_agent_task_template_rejects_unknown_stage() -> None:
    """Check that an unrecognised stage name raises ``ValueError``."""
    bogus_stage = "unknown"
    expected_error = pytest.raises(ValueError, match="Unknown CrewAI stage")
    with expected_error:
        load_agent_task_template(stage=bogus_stage)
def test_load_crewai_agent_templates_rejects_invalid_yaml_shape() -> None:
    """Check that a template file whose top-level YAML node is a list is rejected.

    The fixture file is written to a throwaway temporary directory instead of
    the repository's ``src/core/config/static/crewai`` folder. That keeps the
    test from touching tracked source directories, lets it run on read-only
    checkouts, and removes the dependency on this file's depth in the tree.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Same file name as before, in case the loader reports it in its error.
        path = Path(scratch_dir) / "agents.invalid-shape.yaml"
        path.write_text("- invalid\n", encoding="utf-8")
        with pytest.raises(ValueError, match="Invalid CrewAI template format"):
            load_crewai_agent_templates(path)
def test_load_crewai_agent_templates_rejects_missing_required_fields() -> None:
    """Check that an agent template with only a ``role`` field is rejected.

    The fixture file is written to a throwaway temporary directory instead of
    the repository's ``src/core/config/static/crewai`` folder. That keeps the
    test from touching tracked source directories, lets it run on read-only
    checkouts, and removes the dependency on this file's depth in the tree.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Same file name as before, in case the loader reports it in its error.
        path = Path(scratch_dir) / "agents.invalid.yaml"
        path.write_text("intent:\n  role: Intent Agent\n", encoding="utf-8")
        with pytest.raises(ValueError, match="Invalid CrewAI agent template"):
            load_crewai_agent_templates(path)
@@ -3,8 +3,10 @@ from __future__ import annotations
from types import MethodType, SimpleNamespace
from typing import cast
import core.agent.infrastructure.crewai.runtime as runtime_module
import core.agent.infrastructure.crewai.runtime_stage_runner as stage_runner_module
from core.agent.infrastructure.config.resolver import AgentConfigResolver, SettingsLike
from core.agent.infrastructure.crewai.runtime import CrewAIRuntime
from core.agent.infrastructure.crewai.runtime import CrewAIRuntime, _parse_intent_result
from core.agent.infrastructure.litellm.usage_tracker import UsageCost
@@ -127,6 +129,298 @@ def test_runtime_needs_execution_and_collects_front_tool_call() -> None:
assert result["total_tokens"] == 6
def test_runtime_extracts_pending_front_tool_from_execution_data() -> None:
    """Check that a pending frontend tool is derived from ``execution_data``.

    The stubbed execution stage reports ``tool_name``/``arguments`` with a
    ``pending_approval`` result status; ``execute`` must expose that call as
    ``pending_front_tool`` targeting the frontend.
    """
    runtime = _build_runtime()

    intent_reply = '{"route":"NEEDS_EXECUTION","intent_summary":"navigate","execution_brief":"call tool","safety_flags":[]}'
    execution_reply = '{"status":"SUCCESS","execution_summary":"done","execution_data":{"tool_name":"front.navigate_to_route","arguments":{"target":"/calendar/dayweek","replace":false},"result_status":"pending_approval"},"report_brief":"awaiting approval"}'
    organization_reply = '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}'

    def _scripted_stage(self, **kwargs):
        # Canned (text, usage, calls, pending) tuple per stage.
        requested = kwargs["stage"]
        if requested == "intent":
            raw, usage = intent_reply, UsageCost(1, 1, 2, 0.01)
        elif requested == "execution":
            raw, usage = execution_reply, UsageCost(2, 2, 4, 0.02)
        else:
            raw, usage = organization_reply, UsageCost(3, 3, 6, 0.03)
        return (raw, usage, [], None)

    runtime._run_stage_with_crewai = MethodType(_scripted_stage, runtime)  # type: ignore[method-assign]

    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "navigate",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
            "required": ["target"],
        },
    }
    outcome = runtime.execute(user_input="go", tools=[navigate_tool])

    expected_pending = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }
    assert outcome["pending_front_tool"] == expected_pending
def test_runtime_multimodal_intent_receives_execution_tool_awareness() -> None:
    """Check that the first stage call sees ``back.create_calendar_event``.

    Every stage invocation is recorded with the tools it was given; the first
    recorded call must list the backend calendar tool even though the caller
    passed no tools.
    """
    runtime = _build_runtime()
    recorded: list[dict[str, object]] = []

    def _scripted_stage(self, **kwargs):
        requested = kwargs["stage"]
        offered_tools = kwargs["tools_payload"]
        recorded.append({"stage": requested, "tools": offered_tools})
        if requested == "intent":
            raw = '{"route":"NEEDS_EXECUTION","intent_summary":"need tool","execution_brief":"call back.create_calendar_event","safety_flags":[]}'
            usage = UsageCost(1, 1, 2, 0.01)
        elif requested == "execution":
            raw = '{"status":"SUCCESS","execution_summary":"done","execution_data":{},"report_brief":"ok"}'
            usage = UsageCost(2, 2, 4, 0.02)
        else:
            raw = '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}'
            usage = UsageCost(3, 3, 6, 0.03)
        return (raw, usage, [], None)

    runtime._run_stage_with_crewai = MethodType(_scripted_stage, runtime)  # type: ignore[method-assign]

    multimodal_parts = [{"type": "text", "text": "hello"}]
    runtime.execute(
        user_input="go",
        user_input_multimodal=multimodal_parts,
        tools=[],
    )

    first_call_tools = cast(list[dict[str, object]], recorded[0]["tools"])
    assert any(
        entry.get("name") == "back.create_calendar_event"
        for entry in first_call_tools
    )
def test_runtime_synthesizes_backend_call_when_model_skips_react_tool_call() -> None:
    """Check that the backend handler is invoked from ``execution_data`` alone.

    The stubbed execution stage returns no tool calls, only ``execution_data``
    with a title and timezone. The registered backend handler must still be
    called once for ``back.create_calendar_event`` with that data, and the
    result must list a backend tool call with that name.
    """
    runtime = _build_runtime()
    handler_invocations: list[tuple[str, dict[str, object]]] = []

    def _backend_handler(
        tool_name: str, tool_args: dict[str, object]
    ) -> dict[str, object]:
        # Record the invocation and answer with a minimal calendar card.
        handler_invocations.append((tool_name, tool_args))
        card_data = {"id": "evt-1", "title": str(tool_args.get("title", ""))}
        return {
            "type": "calendar_card.v1",
            "version": "v1",
            "data": card_data,
            "actions": [],
        }

    runtime.set_backend_tool_handler(_backend_handler)

    def _scripted_stage(self, **kwargs):
        requested = kwargs["stage"]
        if requested == "intent":
            raw = '{"route":"NEEDS_EXECUTION","intent_summary":"create event","execution_brief":"create via backend tool","safety_flags":[]}'
            usage = UsageCost(1, 1, 2, 0.01)
        elif requested == "execution":
            raw = '{"status":"SUCCESS","execution_summary":"created","execution_data":{"title":"项目评审","timezone":"Asia/Shanghai"},"report_brief":"done"}'
            usage = UsageCost(2, 2, 4, 0.02)
        else:
            raw = '{"assistant_text":"ok","response_metadata":{}}'
            usage = UsageCost(1, 1, 2, 0.01)
        return (raw, usage, [], None)

    runtime._run_stage_with_crewai = MethodType(_scripted_stage, runtime)  # type: ignore[method-assign]

    outcome = runtime.execute(user_input="创建日程", tools=[])

    expected_invocation = (
        "back.create_calendar_event",
        {"title": "项目评审", "timezone": "Asia/Shanghai"},
    )
    assert handler_invocations == [expected_invocation]

    reported_calls = cast(list[dict[str, object]], outcome["tool_calls"])
    assert any(
        entry.get("target") == "backend"
        and entry.get("name") == "back.create_calendar_event"
        for entry in reported_calls
    )
def test_runtime_extracts_pending_front_tool_from_approval_required_shape() -> None:
    """Check the flat ``approval_required`` shape yields a pending frontend tool.

    Here the stubbed execution stage puts ``target`` directly in
    ``execution_data`` next to ``tool_name`` and ``approval_required``; the
    result must still carry the full pending call with ``replace`` set to
    ``False``.
    """
    runtime = _build_runtime()

    intent_reply = '{"route":"NEEDS_EXECUTION","intent_summary":"navigate","execution_brief":"call tool","safety_flags":[]}'
    execution_reply = '{"status":"PARTIAL","execution_summary":"approval needed","execution_data":{"tool_name":"front.navigate_to_route","target":"/calendar/dayweek","approval_required":true},"report_brief":"await approval"}'
    organization_reply = '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}'

    def _scripted_stage(self, **kwargs):
        # Canned (text, usage, calls, pending) tuple per stage.
        requested = kwargs["stage"]
        if requested == "intent":
            raw, usage = intent_reply, UsageCost(1, 1, 2, 0.01)
        elif requested == "execution":
            raw, usage = execution_reply, UsageCost(2, 2, 4, 0.02)
        else:
            raw, usage = organization_reply, UsageCost(3, 3, 6, 0.03)
        return (raw, usage, [], None)

    runtime._run_stage_with_crewai = MethodType(_scripted_stage, runtime)  # type: ignore[method-assign]

    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "navigate",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
            "required": ["target"],
        },
    }
    outcome = runtime.execute(user_input="go", tools=[navigate_tool])

    expected_pending = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }
    assert outcome["pending_front_tool"] == expected_pending
def test_runtime_resume_from_execution_stage_keeps_valid_intent_payload() -> None:
    """Check that resuming at the execution stage yields ``assistant_text == "ok"``.

    NOTE(review): the stub answers any non-execution stage with
    "final answer", yet the assertion expects "ok"; presumably the runtime
    supplies that text itself on resume. Confirm against the runtime.
    """
    runtime = _build_runtime()

    def _scripted_stage(self, **kwargs):
        # Only the execution stage is distinguished; everything else gets the
        # organization-style reply.
        if kwargs["stage"] == "execution":
            raw = '{"status":"SUCCESS","execution_summary":"done","execution_data":{},"report_brief":"ok"}'
            usage = UsageCost(2, 2, 4, 0.02)
        else:
            raw = '{"assistant_text":"final answer","response_metadata":{"source":"organization"}}'
            usage = UsageCost(3, 3, 6, 0.03)
        return (raw, usage, [], None)

    runtime._run_stage_with_crewai = MethodType(_scripted_stage, runtime)  # type: ignore[method-assign]

    outcome = runtime.execute(
        user_input="resume",
        tools=[],
        resume_from_stage="execution",
    )

    assert outcome["assistant_text"] == "ok"
def test_run_stage_with_crewai_uses_output_pydantic_for_stage(
    monkeypatch,
) -> None:
    """Check that the intent stage passes ``IntentResult`` as ``output_pydantic``.

    ``LLM``, ``Agent``, ``Task`` and ``Crew`` in the stage-runner module are
    replaced by recording fakes. The fake crew returns a pydantic
    ``IntentResult`` plus token usage; the returned text must round-trip
    through ``IntentResult`` and the usage total must be taken from the
    output.
    """
    runtime = _build_runtime()
    seen: dict[str, object] = {}

    class _FakeLLM:
        def __init__(self, **kwargs):
            seen["llm_kwargs"] = kwargs

    class _FakeAgent:
        def __init__(self, **kwargs):
            seen["agent_kwargs"] = kwargs
            self.llm = kwargs.get("llm")

    class _FakeTask:
        def __init__(self, **kwargs):
            seen["task_kwargs"] = kwargs

    class _FakeCrew:
        def __init__(self, **kwargs):
            seen["crew_kwargs"] = kwargs

        def kickoff(self):
            structured = runtime_module.IntentResult(
                route="DIRECT_EXECUTION",
                intent_summary="intent",
                assistant_text="ok",
                safety_flags=[],
            )
            usage_counts = SimpleNamespace(
                prompt_tokens=1,
                completion_tokens=2,
                total_tokens=3,
            )
            return SimpleNamespace(
                raw="ignored",
                pydantic=structured,
                json_dict=None,
                token_usage=usage_counts,
            )

    replacements = (
        ("LLM", _FakeLLM),
        ("Agent", _FakeAgent),
        ("Task", _FakeTask),
        ("Crew", _FakeCrew),
    )
    for attr_name, fake in replacements:
        monkeypatch.setattr(stage_runner_module, attr_name, fake)

    stage_text, stage_usage, stage_calls, stage_pending = (
        runtime._run_stage_with_crewai(
            stage="intent",
            user_content="hello",
            system_prompt="",
            tools_payload=[],
            litellm_model="dashscope/qwen3.5-flash",
        )
    )

    task_kwargs = cast(dict[str, object], seen["task_kwargs"])
    assert task_kwargs.get("output_pydantic") is runtime_module.IntentResult
    reparsed = runtime_module.IntentResult.model_validate_json(stage_text)
    assert reparsed.assistant_text == "ok"
    assert stage_usage.total_tokens == 3
    assert stage_calls == []
    assert stage_pending is None
def test_runtime_backend_registry_check() -> None:
    """Check that ``back.create_calendar_event`` is reported as registered."""
    is_known = _build_runtime().is_registered_backend_tool(
        "back.create_calendar_event"
    )
    assert is_known is True
@@ -179,3 +473,184 @@ def test_runtime_emits_step_started_finished_for_all_three_stages() -> None:
"organization",
"organization",
]
def test_parse_intent_result_accepts_markdown_json_fence() -> None:
    """Check that JSON wrapped in a Markdown ``json`` code fence is parsed."""
    # The literal is kept exactly as in the original test.
    fenced_reply = """```json
{
\"route\": \"DIRECT_EXECUTION\",
\"intent_summary\": \"navigate\",
\"assistant_text\": \"ok\",
\"safety_flags\": []
}
```"""

    parsed = _parse_intent_result(fenced_reply)

    assert parsed.route == "DIRECT_EXECUTION"
    assert parsed.assistant_text == "ok"
def test_parse_intent_result_coerces_structured_fields() -> None:
    """Check coercion of an object ``execution_brief`` and object ``safety_flags``.

    With a dict-valued ``execution_brief`` and a dict of boolean safety flags,
    the parsed result must have route ``NEEDS_EXECUTION``, an execution brief
    that mentions the tool name, and only the true flag in ``safety_flags``.
    """
    # The literal is kept exactly as in the original test.
    structured_reply = """{
"route": "DIRECT_EXECUTION",
"intent_summary": "navigate",
"assistant_text": "",
"execution_brief": {
"action": "front.navigate_to_route",
"target": "/calendar/dayweek"
},
"safety_flags": {
"security_concern": false,
"requires_confirmation": true
}
}"""

    parsed = _parse_intent_result(structured_reply)

    assert parsed.route == "NEEDS_EXECUTION"
    assert parsed.execution_brief is not None
    assert "front.navigate_to_route" in parsed.execution_brief
    assert parsed.safety_flags == ["requires_confirmation"]
def test_parse_intent_result_coerces_structured_intent_summary() -> None:
    """Check that an object-valued ``intent_summary`` becomes a JSON-like string.

    The parsed summary must start with ``{`` and still contain the original
    ``intent_type`` value.
    """
    # The literal is kept exactly as in the original test.
    structured_reply = """{
"route": "NEEDS_EXECUTION",
"intent_summary": {
"intent_type": "Navigation Request",
"confidence": 0.93
},
"execution_brief": "call front tool",
"safety_flags": []
}"""

    parsed = _parse_intent_result(structured_reply)

    assert parsed.route == "NEEDS_EXECUTION"
    summary_text = parsed.intent_summary
    assert summary_text.startswith("{")
    assert "Navigation Request" in summary_text
def test_runtime_uses_prompt_module_for_stage_descriptions(monkeypatch) -> None:
    """Check that the task description comes from ``build_stage_task_description``.

    The prompt builder on ``runtime_stage_prompts`` is replaced by a stub
    returning a sentinel; the fake ``Task`` must receive exactly that sentinel
    as its ``description``.
    """
    runtime = _build_runtime()
    observed: dict[str, object] = {"called": False}

    class _FakeLLM:
        def __init__(self, **kwargs):
            del kwargs

    class _FakeAgent:
        def __init__(self, **kwargs):
            self.llm = kwargs.get("llm")

    class _FakeTask:
        def __init__(self, **kwargs):
            observed["description"] = kwargs.get("description")

    class _FakeCrew:
        def __init__(self, **kwargs):
            del kwargs

        def kickoff(self):
            structured = runtime_module.IntentResult(
                route="DIRECT_EXECUTION",
                intent_summary="intent",
                assistant_text="ok",
                safety_flags=[],
            )
            usage_counts = SimpleNamespace(
                prompt_tokens=1,
                completion_tokens=2,
                total_tokens=3,
            )
            return SimpleNamespace(
                raw="ignored",
                pydantic=structured,
                json_dict=None,
                token_usage=usage_counts,
            )

    def _stub_prompt_builder(**kwargs):
        del kwargs
        observed["called"] = True
        return "PROMPT_FROM_MODULE"

    replacements = (
        ("LLM", _FakeLLM),
        ("Agent", _FakeAgent),
        ("Task", _FakeTask),
        ("Crew", _FakeCrew),
    )
    for attr_name, fake in replacements:
        monkeypatch.setattr(stage_runner_module, attr_name, fake)
    monkeypatch.setattr(
        stage_runner_module.runtime_stage_prompts,
        "build_stage_task_description",
        _stub_prompt_builder,
    )

    runtime._run_stage_with_crewai(
        stage="intent",
        user_content="hello",
        system_prompt="",
        tools_payload=[],
        litellm_model="dashscope/qwen3.5-flash",
    )

    assert observed["called"] is True
    assert observed["description"] == "PROMPT_FROM_MODULE"
def test_run_stage_with_crewai_does_not_force_execution_output_pydantic(
    monkeypatch,
) -> None:
    """Check that the execution stage builds its ``Task`` without ``output_pydantic``.

    The fake ``Task`` records the ``output_pydantic`` keyword it was given
    (``None`` when absent); for the execution stage this must be ``None``.
    """
    runtime = _build_runtime()
    observed: dict[str, object] = {}

    class _FakeLLM:
        def __init__(self, **kwargs):
            del kwargs

    class _FakeAgent:
        def __init__(self, **kwargs):
            self.llm = kwargs.get("llm")

    class _FakeTask:
        def __init__(self, **kwargs):
            observed["output_pydantic"] = kwargs.get("output_pydantic")

    class _FakeCrew:
        def __init__(self, **kwargs):
            del kwargs

        def kickoff(self):
            raw_reply = (
                '{"status":"SUCCESS","execution_summary":"done",'
                '"execution_data":{},"report_brief":"ok"}'
            )
            usage_counts = SimpleNamespace(
                prompt_tokens=1,
                completion_tokens=2,
                total_tokens=3,
            )
            return SimpleNamespace(
                raw=raw_reply,
                pydantic=None,
                json_dict=None,
                token_usage=usage_counts,
            )

    replacements = (
        ("LLM", _FakeLLM),
        ("Agent", _FakeAgent),
        ("Task", _FakeTask),
        ("Crew", _FakeCrew),
    )
    for attr_name, fake in replacements:
        monkeypatch.setattr(stage_runner_module, attr_name, fake)

    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "navigate",
        "parameters": {
            "type": "object",
            "properties": {"target": {"type": "string"}},
            "required": ["target"],
        },
    }
    runtime._run_stage_with_crewai(
        stage="execution",
        user_content='{"user_input":"go","intent_summary":"navigate"}',
        system_prompt="",
        tools_payload=[navigate_tool],
        litellm_model="dashscope/qwen3.5-flash",
    )

    assert observed["output_pydantic"] is None
@@ -0,0 +1,19 @@
from __future__ import annotations
from core.agent.infrastructure.crewai.runtime_parsers import parse_execution_result
def test_parse_execution_result_preserves_execution_data_for_interrupted_status() -> (
    None
):
    """Check that an ``interrupted`` status keeps ``execution_data`` intact.

    The status is expected to be normalised to ``PARTIAL`` while the
    ``tool_called`` and ``input`` entries survive unchanged.
    """
    interrupted_reply = (
        '{"status":"interrupted","execution_summary":"approval needed",'
        '"execution_data":{"tool_called":"front.navigate_to_route",'
        '"input":{"target":"/calendar/dayweek"},'
        '"error":"frontend tool requires approval"},'
        '"report_brief":"await approval"}'
    )

    parsed = parse_execution_result(interrupted_reply)

    assert parsed.status == "PARTIAL"
    preserved = parsed.execution_data
    assert preserved.get("tool_called") == "front.navigate_to_route"
    assert preserved.get("input") == {"target": "/calendar/dayweek"}
@@ -0,0 +1,223 @@
from __future__ import annotations
import pytest
from crewai.agents import parser as crew_parser
from core.agent.infrastructure.crewai.runtime_tools import (
PendingFrontendToolCall,
extract_pending_front_tool,
resolve_stage_crewai_tools,
)
def test_frontend_tool_accepts_direct_kwargs_and_raises_pending() -> None:
    """Check that running a frontend tool raises ``PendingFrontendToolCall``.

    The exception payload must carry the tool name and the keyword arguments
    passed to ``run``.
    """
    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "Navigate to route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
            "required": ["target"],
        },
    }
    recorded_calls: list[dict[str, object]] = []
    stage_tools = resolve_stage_crewai_tools(
        tools_payload=[navigate_tool],
        calls=recorded_calls,
        backend_handler=None,
    )

    with pytest.raises(PendingFrontendToolCall) as raised:
        stage_tools[0].run(target="/calendar/dayweek", replace=False)

    pending_payload = raised.value.payload
    assert pending_payload["name"] == "front.navigate_to_route"
    assert pending_payload["args"] == {
        "target": "/calendar/dayweek",
        "replace": False,
    }
def test_react_action_text_can_address_frontend_tool_name() -> None:
    """Check that a dotted tool name in ReAct text resolves to the frontend tool.

    The CrewAI parser must produce an ``AgentAction`` for the text, and the
    tool whose name equals the parsed ``tool`` must raise
    ``PendingFrontendToolCall`` when run.
    """
    react_text = (
        "Thought: need route change\n"
        "Action: front.navigate_to_route\n"
        'Action Input: {"target":"/calendar/dayweek","replace":false}'
    )
    action = crew_parser.parse(react_text)
    assert isinstance(action, crew_parser.AgentAction)

    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "Navigate to route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
            "required": ["target"],
        },
    }
    recorded_calls: list[dict[str, object]] = []
    stage_tools = resolve_stage_crewai_tools(
        tools_payload=[navigate_tool],
        calls=recorded_calls,
        backend_handler=None,
    )

    matching_tool = next(
        candidate for candidate in stage_tools if candidate.name == action.tool
    )
    action_input = {"target": "/calendar/dayweek", "replace": False}
    with pytest.raises(PendingFrontendToolCall) as raised:
        matching_tool.run(**action_input)

    assert raised.value.payload["name"] == "front.navigate_to_route"
def test_dynamic_tool_args_schema_follows_tool_parameters() -> None:
    """Check that the generated ``args_schema`` mirrors the declared parameters.

    Both declared properties must appear in the JSON schema, and only
    ``target`` may be listed as required.
    """
    navigate_tool = {
        "name": "front.navigate_to_route",
        "description": "Navigate to route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
            "required": ["target"],
        },
    }
    recorded_calls: list[dict[str, object]] = []
    stage_tools = resolve_stage_crewai_tools(
        tools_payload=[navigate_tool],
        calls=recorded_calls,
        backend_handler=None,
    )

    json_schema = stage_tools[0].args_schema.model_json_schema()
    declared_props = json_schema.get("properties", {})
    required_names = json_schema.get("required", [])

    assert isinstance(declared_props, dict)
    assert "target" in declared_props
    assert "replace" in declared_props
    assert required_names == ["target"]
def test_extract_pending_front_tool_supports_tool_called_and_input_fields() -> None:
    """Check extraction from ``tool_called``/``input`` with ``pending_approval``.

    ``replace`` is missing from the reported input, and the expected args
    carry it as ``False``.
    """
    navigate_tool = {
        "name": "front.navigate_to_route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
        },
    }
    reported_data = {
        "tool_called": "front.navigate_to_route",
        "input": {"target": "/calendar/dayweek"},
        "status": "pending_approval",
    }

    extracted = extract_pending_front_tool(
        execution_tools=[navigate_tool],
        pending_call=None,
        execution_data=reported_data,
    )

    expected_pending = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }
    assert extracted == expected_pending
def test_extract_pending_front_tool_supports_interrupted_status_with_error() -> None:
    """Check extraction when the data reports ``interrupted`` plus an ``error``."""
    navigate_tool = {
        "name": "front.navigate_to_route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
        },
    }
    reported_data = {
        "status": "interrupted",
        "tool_called": "front.navigate_to_route",
        "parameters": {"target": "/calendar/dayweek", "replace": False},
        "error": "frontend tool requires approval",
    }

    extracted = extract_pending_front_tool(
        execution_tools=[navigate_tool],
        pending_call=None,
        execution_data=reported_data,
    )

    expected_pending = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }
    assert extracted == expected_pending
def test_extract_pending_front_tool_supports_approval_result_field() -> None:
    """Check extraction when ``result`` is ``approval_required_error``."""
    navigate_tool = {
        "name": "front.navigate_to_route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
        },
    }
    reported_data = {
        "tool_called": "front.navigate_to_route",
        "parameters": {"target": "/calendar/dayweek", "replace": False},
        "result": "approval_required_error",
    }

    extracted = extract_pending_front_tool(
        execution_tools=[navigate_tool],
        pending_call=None,
        execution_data=reported_data,
    )

    expected_pending = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }
    assert extracted == expected_pending
def test_extract_pending_front_tool_supports_observation_field() -> None:
    """Check extraction when only ``observation`` mentions the approval need."""
    navigate_tool = {
        "name": "front.navigate_to_route",
        "parameters": {
            "type": "object",
            "properties": {
                "target": {"type": "string"},
                "replace": {"type": "boolean"},
            },
        },
    }
    reported_data = {
        "tool_called": "front.navigate_to_route",
        "parameters": {"target": "/calendar/dayweek", "replace": False},
        "observation": "frontend tool requires approval.",
    }

    extracted = extract_pending_front_tool(
        execution_tools=[navigate_tool],
        pending_call=None,
        execution_data=reported_data,
    )

    expected_pending = {
        "name": "front.navigate_to_route",
        "args": {"target": "/calendar/dayweek", "replace": False},
        "target": "frontend",
    }
    assert extracted == expected_pending
@@ -0,0 +1,16 @@
from __future__ import annotations
from core.agent.prompt.runtime_stage_prompts import build_stage_task_description
def test_execution_stage_prompt_includes_react_tool_invocation_rule() -> None:
    """Check that the execution prompt contains ``Action:`` and ``Action Input:``."""
    offered_tools = [{"name": "front.navigate_to_route"}]

    rendered = build_stage_task_description(
        stage="execution",
        task_description="execute",
        tools_payload=offered_tools,
        system_prompt="",
        user_content="go",
    )

    assert "Action:" in rendered
    assert "Action Input:" in rendered
@@ -0,0 +1,26 @@
from __future__ import annotations
import pytest
import core.agent.infrastructure.crewai.tools.stage_tool_allowlist as allowlist_module
def test_load_crewai_stage_tools_returns_expected_defaults() -> None:
    """Check the default allowlist: only ``execution`` has a backend tool."""
    expected_defaults = {
        "intent": [],
        "execution": ["back.create_calendar_event"],
        "organization": [],
    }

    loaded = allowlist_module.load_crewai_stage_tools()

    assert loaded == expected_defaults
def test_load_crewai_stage_tools_rejects_unknown_backend_tool(monkeypatch) -> None:
    """Check that an allowlist naming an unknown backend tool raises ``ValueError``."""
    tampered_allowlist = {"execution": ["back.unknown"]}
    monkeypatch.setattr(
        allowlist_module,
        "STAGE_TOOL_ALLOWLIST",
        tampered_allowlist,
    )

    expected_error = pytest.raises(ValueError, match="unknown backend tool")
    with expected_error:
        allowlist_module.load_crewai_stage_tools()