feat(agent): redesign project_cli with module/method/input protocol

- Replace command/subcommand/args with module/method/input envelope
- Calendar handler uses discriminated union (mode) for read operations
- Strict Pydantic models with extra='forbid' for all calendar methods
- Worker max_iters=7, router prompt simplified (removed project_cli_defaults)
- Skill index cards + per-action files for progressive disclosure
- Frontend/AG-UI aligned to module/method dispatch
- Protocol docs updated to module/method/input contract

WIP: action cards need envelope fix, 2 tests need update, memory
handler needs Pydantic models.
This commit is contained in:
qzl
2026-04-24 13:24:13 +08:00
parent ab526af2c4
commit d060962a5f
62 changed files with 4802 additions and 805 deletions
View File
+196
View File
@@ -0,0 +1,196 @@
from __future__ import annotations
import os
import time
from pathlib import Path
from uuid import uuid4
import httpx
import jwt
def _load_env() -> None:
env_path = Path(__file__).resolve().parents[3] / ".env"
if env_path.exists():
for line in env_path.read_text().splitlines():
line = line.strip()
if not line or line.startswith("#") or "=" not in line:
continue
key, _, value = line.partition("=")
key = key.strip()
value = value.strip().strip('"').strip("'")
if key and key not in os.environ:
os.environ[key] = value
_load_env()
BASE_URL = os.getenv("AGENT_LIVE_BASE_URL", "http://localhost:5775")
def get_jwt_secret() -> str:
secret = (
os.getenv("SOCIAL_SUPABASE__JWT_SECRET")
or os.getenv("SUPABASE_JWT_SECRET")
or os.getenv("JWT_SECRET")
)
if not secret:
raise RuntimeError("JWT_SECRET not found in environment")
return secret
def get_supabase_url() -> str:
return (
os.getenv("SOCIAL_SUPABASE__URL")
or os.getenv("SUPABASE_URL")
or "http://localhost:54321"
)
def get_test_user_id() -> str:
user_id = os.getenv("TEST_USER_ID")
if user_id:
return user_id
raise RuntimeError("TEST_USER_ID not set")
def create_test_jwt(user_id: str) -> str:
now = int(time.time())
payload = {
"sub": user_id,
"role": "authenticated",
"aud": "authenticated",
"iss": get_supabase_url(),
"iat": now,
"exp": now + 3600,
}
return jwt.encode(payload, get_jwt_secret(), algorithm="HS256")
async def run_agent_and_collect(
*,
user_message: str,
client: httpx.AsyncClient,
headers: dict,
run_id: str | None = None,
thread_id: str | None = None,
timeout: float = 120.0,
) -> AgentRunResult:
if thread_id is None:
thread_id = str(uuid4())
if run_id is None:
run_id = f"quality-{thread_id[:8]}"
t_start = time.monotonic()
run_resp = await client.post(
f"{BASE_URL}/api/v1/agent/runs",
headers=headers,
json={
"threadId": thread_id,
"runId": run_id,
"state": {},
"messages": [
{"id": "u1", "role": "user", "content": user_message}
],
"tools": [],
"context": [],
"forwardedProps": {"runtime_mode": "chat"},
},
)
run_data = run_resp.json()
effective_thread_id = str(run_data.get("threadId", thread_id))
effective_run_id = run_data.get("runId", run_id)
events_url = (
f"{BASE_URL}/api/v1/agent/runs/{effective_thread_id}/events"
f"?runId={effective_run_id}"
)
import json
tool_results: list[dict] = []
all_events: list[dict] = []
run_finished = False
final_answer = ""
async with client.stream(
"GET", events_url, headers=headers, timeout=timeout
) as sse_resp:
buffer = ""
async for line in sse_resp.aiter_lines():
if line.startswith("data:"):
data_str = line.split(":", 1)[1].strip()
if data_str:
buffer = data_str
elif line == "" and buffer:
try:
event_data = json.loads(buffer)
event_type = event_data.get("type")
all_events.append(event_data)
if event_type == "TOOL_CALL_RESULT":
tool_results.append(event_data)
elif event_type == "TEXT_MESSAGE_END":
final_answer = event_data.get("answer", "") or event_data.get("text", "")
elif event_type in {"RUN_FINISHED", "RUN_ERROR"}:
run_finished = True
except json.JSONDecodeError:
pass
buffer = ""
t_end = time.monotonic()
return AgentRunResult(
thread_id=effective_thread_id,
run_id=effective_run_id,
user_message=user_message,
final_answer=final_answer,
tool_results=tool_results,
all_events=all_events,
run_finished=run_finished,
latency_ms=round((t_end - t_start) * 1000),
)
class AgentRunResult:
def __init__(
self,
*,
thread_id: str,
run_id: str,
user_message: str,
final_answer: str,
tool_results: list[dict],
all_events: list[dict],
run_finished: bool,
latency_ms: int,
) -> None:
self.thread_id = thread_id
self.run_id = run_id
self.user_message = user_message
self.final_answer = final_answer
self.tool_results = tool_results
self.all_events = all_events
self.run_finished = run_finished
self.latency_ms = latency_ms
@property
def tool_names_called(self) -> list[str]:
return [
tr.get("tool_name", "") or tr.get("toolName", "")
for tr in self.tool_results
]
@property
def successful_tool_names(self) -> list[str]:
return [
tr.get("tool_name", "") or tr.get("toolName", "")
for tr in self.tool_results
if tr.get("status") in ("success", "partial")
]
@property
def has_tool_success(self) -> bool:
return len(self.successful_tool_names) > 0
@@ -0,0 +1,99 @@
from __future__ import annotations
from pydantic import BaseModel
class ScoreDetail(BaseModel):
criterion: str
passed: bool
note: str = ""
class ScenarioScore(BaseModel):
scenario_id: str
model_code: str
latency_ms: int
input_tokens: int = 0
output_tokens: int = 0
cost_usd: float = 0.0
tool_called: bool
tool_succeeded: bool
answer_quality: float
details: list[ScoreDetail]
raw_answer: str = ""
run_finished: bool = True
@property
def overall_score(self) -> float:
weights = {
"tool_correctness": 0.3,
"answer_quality": 0.5,
"latency": 0.2,
}
tool_score = 1.0 if self.tool_succeeded else (0.5 if self.tool_called else 0.0)
latency_score = self._latency_score()
return (
weights["tool_correctness"] * tool_score
+ weights["answer_quality"] * self.answer_quality
+ weights["latency"] * latency_score
)
def _latency_score(self) -> float:
if self.latency_ms <= 5000:
return 1.0
if self.latency_ms <= 15000:
return 0.7
if self.latency_ms <= 30000:
return 0.4
return 0.1
class ModelScorecard(BaseModel):
model_code: str
scenario_scores: list[ScenarioScore]
@property
def avg_overall(self) -> float:
if not self.scenario_scores:
return 0.0
return sum(s.overall_score for s in self.scenario_scores) / len(self.scenario_scores)
@property
def avg_latency_ms(self) -> float:
if not self.scenario_scores:
return 0.0
return sum(s.latency_ms for s in self.scenario_scores) / len(self.scenario_scores)
@property
def avg_cost_usd(self) -> float:
if not self.scenario_scores:
return 0.0
return sum(s.cost_usd for s in self.scenario_scores) / len(self.scenario_scores)
@property
def tool_success_rate(self) -> float:
if not self.scenario_scores:
return 0.0
return sum(1 for s in self.scenario_scores if s.tool_succeeded) / len(self.scenario_scores)
def summary_table(self) -> str:
lines = [
f"\n{'='*60}",
f"Model Scorecard: {self.model_code}",
f"{'='*60}",
f" Avg Overall Score : {self.avg_overall:.2f}",
f" Avg Latency : {self.avg_latency_ms:.0f}ms",
f" Avg Cost : ${self.avg_cost_usd:.6f}",
f" Tool Success Rate : {self.tool_success_rate:.0%}",
f"{'-'*60}",
]
for s in self.scenario_scores:
status = "PASS" if s.tool_succeeded else "FAIL"
lines.append(
f" [{status}] {s.scenario_id:<25} "
f"score={s.overall_score:.2f} "
f"lat={s.latency_ms}ms "
f"cost=${s.cost_usd:.6f}"
)
lines.append(f"{'='*60}")
return "\n".join(lines)
@@ -0,0 +1,82 @@
from __future__ import annotations
from pydantic import BaseModel
class EvalScenario(BaseModel):
id: str
prompt: str
category: str
expect_tool_use: bool
expect_tool_success: bool
quality_criteria: list[str]
CALENDAR_SCENARIOS: list[EvalScenario] = [
EvalScenario(
id="calendar-read-today",
prompt="请查询我今天的日程安排",
category="calendar",
expect_tool_use=True,
expect_tool_success=True,
quality_criteria=[
"应调用 project_cli 的 calendar.read 方法",
"input 应包含 mode=day 和具体日期",
"回答应基于工具返回的实际数据",
"如果无日程,应明确告知无日程",
],
),
EvalScenario(
id="calendar-create-event",
prompt="帮我创建一个明天下午3点两小时的会议,标题是项目周会",
category="calendar",
expect_tool_use=True,
expect_tool_success=True,
quality_criteria=[
"应调用 project_cli 的 calendar.create 方法",
"input 应包含 title、start_at、timezone",
"start_at 应为具体的时间戳而非自然语言",
"应返回创建结果(包含 event_id)",
],
),
EvalScenario(
id="calendar-read-range",
prompt="这周一到周五我有哪些日程?",
category="calendar",
expect_tool_use=True,
expect_tool_success=True,
quality_criteria=[
"应调用 project_cli 的 calendar.read 方法",
"input 应使用 mode=range 或多次 mode=day",
"应提供完整时间范围",
],
),
]
GENERAL_SCENARIOS: list[EvalScenario] = [
EvalScenario(
id="general-greeting",
prompt="你好,你是谁?",
category="general",
expect_tool_use=False,
expect_tool_success=False,
quality_criteria=[
"应简短自我介绍",
"不应调用任何工具",
"回答简洁不啰嗦",
],
),
EvalScenario(
id="general-farewell",
prompt="好的谢谢,再见",
category="general",
expect_tool_use=False,
expect_tool_success=False,
quality_criteria=[
"应礼貌告别",
"不应调用任何工具",
],
),
]
ALL_SCENARIOS = CALENDAR_SCENARIOS + GENERAL_SCENARIOS
+440
View File
@@ -0,0 +1,440 @@
from __future__ import annotations
import json
import os
import time
from uuid import uuid4
import httpx
import jwt
import pytest
from backend.tests.quality.evaluators import ModelScorecard, ScoreDetail, ScenarioScore
from backend.tests.quality.scenarios import ALL_SCENARIOS
CANDIDATE_MODELS = ["qwen3.5-flash", "deepseek-chat"]
MODEL_LLM_IDS = {
"qwen3.5-flash": "c625bce4-970e-4a76-bebe-cb8840fed854",
"deepseek-chat": "12bc1963-4b67-404b-b952-5948bea0f690",
}
BASE_URL = os.getenv("AGENT_LIVE_BASE_URL", "http://localhost:5775")
def _load_env() -> None:
from pathlib import Path
env_path = Path(__file__).resolve().parents[3] / ".env"
if env_path.exists():
for line in env_path.read_text().splitlines():
line = line.strip()
if not line or line.startswith("#") or "=" not in line:
continue
key, _, value = line.partition("=")
key = key.strip()
value = value.strip().strip('"').strip("'")
if key and key not in os.environ:
os.environ[key] = value
_load_env()
def _get_jwt_secret() -> str:
secret = (
os.getenv("SOCIAL_SUPABASE__JWT_SECRET")
or os.getenv("SUPABASE_JWT_SECRET")
or os.getenv("JWT_SECRET")
)
if not secret:
raise RuntimeError("JWT_SECRET not found in environment")
return secret
def _get_supabase_url() -> str:
return (
os.getenv("SOCIAL_SUPABASE__PUBLIC_URL")
or os.getenv("SOCIAL_SUPABASE__URL")
or os.getenv("SUPABASE_URL")
or "http://localhost:54321"
)
def _get_supabase_key() -> str:
from core.config.settings import config
key = os.getenv("SOCIAL_SUPABASE__SERVICE_ROLE_KEY", "")
if key:
return key
return config.supabase.service_role_key
def _get_test_user_id() -> str:
user_id = os.getenv("TEST_USER_ID")
if user_id:
return user_id
raise RuntimeError("TEST_USER_ID not set")
def _create_jwt(user_id: str) -> str:
now = int(time.time())
payload = {
"sub": user_id,
"role": "authenticated",
"aud": "authenticated",
"iss": _get_supabase_url(),
"iat": now,
"exp": now + 3600,
}
return jwt.encode(payload, _get_jwt_secret(), algorithm="HS256")
async def _run_via_http(
*,
user_message: str,
token: str,
timeout: float = 120.0,
) -> dict:
thread_id = str(uuid4())
run_id = f"q-{uuid4().hex[:12]}"
async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
headers = {"Authorization": f"Bearer {token}"}
run_resp = await client.post(
f"{BASE_URL}/api/v1/agent/runs",
headers=headers,
json={
"threadId": thread_id,
"runId": run_id,
"state": {},
"messages": [
{"id": "u1", "role": "user", "content": user_message}
],
"tools": [],
"context": [],
"forwardedProps": {"runtime_mode": "chat"},
},
)
run_data = run_resp.json()
eff_thread = str(run_data.get("threadId", thread_id))
eff_run = run_data.get("runId", run_id)
events_url = (
f"{BASE_URL}/api/v1/agent/runs/{eff_thread}/events"
f"?runId={eff_run}"
)
t_start = time.monotonic()
tool_results: list[dict] = []
all_events: list[dict] = []
final_answer = ""
run_finished = False
token_usage: dict = {}
async with client.stream(
"GET", events_url, headers=headers, timeout=timeout
) as sse:
buffer = ""
async for line in sse.aiter_lines():
if line.startswith("data:"):
data_str = line.split(":", 1)[1].strip()
if data_str:
buffer = data_str
elif line == "" and buffer:
try:
ev = json.loads(buffer)
all_events.append(ev)
etype = ev.get("type")
if etype == "TOOL_CALL_RESULT":
tool_results.append(ev)
elif etype == "TEXT_MESSAGE_END":
final_answer = ev.get("answer", "") or ev.get("text", "")
token_usage = {
"totalTokens": ev.get("totalTokens", 0),
"inputTokens": ev.get("inputTokens", 0),
"outputTokens": ev.get("outputTokens", 0),
"promptCacheMissTokens": ev.get(
"promptCacheMissTokens", 0
),
"promptCacheHitTokens": ev.get(
"promptCacheHitTokens", 0
),
}
elif etype in {"RUN_FINISHED", "RUN_ERROR"}:
run_finished = True
except json.JSONDecodeError:
pass
buffer = ""
t_end = time.monotonic()
tool_names = [
tr.get("tool_name", "") or tr.get("toolName", "")
for tr in tool_results
]
successful_tool_names = [
tr.get("tool_name", "") or tr.get("toolName", "")
for tr in tool_results
if tr.get("status") in ("success", "partial")
]
return {
"final_answer": final_answer,
"tool_results": tool_results,
"tool_names": tool_names,
"successful_tool_names": successful_tool_names,
"run_finished": run_finished,
"latency_ms": round((t_end - t_start) * 1000),
"token_usage": token_usage,
}
def _switch_model(model_code: str) -> None:
from supabase import create_client
sb = create_client(_get_supabase_url(), _get_supabase_key())
llm_id = MODEL_LLM_IDS[model_code]
for agent_type in ("router", "worker"):
(
sb.table("system_agents")
.update({"llm_id": llm_id})
.eq("agent_type", agent_type)
.execute()
)
def _save_original_models() -> list[dict]:
from supabase import create_client
sb = create_client(_get_supabase_url(), _get_supabase_key())
return (
sb.table("system_agents")
.select("agent_type, llm_id")
.execute()
.data
)
def _restore_models(original_rows: list[dict]) -> None:
from supabase import create_client
sb = create_client(_get_supabase_url(), _get_supabase_key())
for row in original_rows:
(
sb.table("system_agents")
.update({"llm_id": row["llm_id"]})
.eq("agent_type", row["agent_type"])
.execute()
)
def _evaluate_answer_quality(
*,
answer: str,
run_finished: bool,
expect_tool_use: bool,
has_tool_success: bool,
tool_names: list[str],
) -> float:
if not run_finished:
return 0.0
if not answer or not answer.strip():
return 0.0
score = 0.6
if expect_tool_use:
if has_tool_success:
score += 0.2
elif tool_names:
score += 0.1
else:
score -= 0.3
else:
if not tool_names:
score += 0.2
else:
score -= 0.1
if len(answer) > 10:
score += 0.1
if "无法" in answer or "失败" in answer or "错误" in answer:
if expect_tool_use:
score -= 0.1
return max(0.0, min(1.0, score))
def _evaluate_criteria(
*,
answer: str,
run_finished: bool,
tool_names: list[str],
has_tool_success: bool,
tool_results: list[dict],
scenario: object,
) -> list[ScoreDetail]:
details: list[ScoreDetail] = []
for criterion in getattr(scenario, "quality_criteria", []):
passed = False
note = ""
if "调用" in criterion or "project_cli" in criterion:
passed = any("project_cli" in tn for tn in tool_names)
note = f"tools: {tool_names}" if not passed else ""
elif "mode" in criterion and "day" in criterion:
for tr in tool_results:
args = tr.get("tool_call_args", {}) or tr.get("toolCallArgs", {})
inp = args.get("input", {})
if isinstance(inp, dict) and inp.get("mode") == "day":
passed = True
break
elif "具体" in criterion or "时间戳" in criterion:
passed = has_tool_success
elif "基于工具" in criterion or "返回" in criterion:
passed = has_tool_success
elif "无日程" in criterion:
passed = "" in answer or "没有" in answer
elif "简短" in criterion or "简洁" in criterion:
passed = 0 < len(answer) < 200
elif "自我介绍" in criterion:
passed = "Linksy" in answer or "助手" in answer
elif "礼貌" in criterion:
passed = len(answer) > 0
else:
passed = run_finished and len(answer) > 0
details.append(ScoreDetail(criterion=criterion, passed=passed, note=note))
return details
async def _run_model_scenarios(model_code: str, user_id: str) -> ModelScorecard:
from services.llm_pricing.service import LlmPricingService
pricing = LlmPricingService()
token = _create_jwt(user_id)
scores: list[ScenarioScore] = []
for scenario in ALL_SCENARIOS:
result = await _run_via_http(
user_message=scenario.prompt,
token=token,
)
answer = result["final_answer"]
tool_names = result["tool_names"]
has_tool_success = len(result["successful_tool_names"]) > 0
tu = result["token_usage"]
total_tokens = tu.get("totalTokens", 0)
input_tokens = tu.get("inputTokens", 0) or tu.get("promptCacheMissTokens", 0)
output_tokens = tu.get("outputTokens", 0) or max(total_tokens - input_tokens, 0)
try:
cost_usd = pricing.calculate_cost(
model=model_code,
prompt_tokens=input_tokens,
completion_tokens=output_tokens,
cached_prompt_tokens=tu.get("promptCacheHitTokens", 0),
)
except ValueError:
cost_usd = 0.0
cost_usd = round(cost_usd, 8)
tool_called = any("project_cli" in tn for tn in tool_names)
tool_succeeded = has_tool_success if scenario.expect_tool_use else True
answer_quality = _evaluate_answer_quality(
answer=answer,
run_finished=result["run_finished"],
expect_tool_use=scenario.expect_tool_use,
has_tool_success=has_tool_success,
tool_names=tool_names,
)
details = _evaluate_criteria(
answer=answer,
run_finished=result["run_finished"],
tool_names=tool_names,
has_tool_success=has_tool_success,
tool_results=result["tool_results"],
scenario=scenario,
)
print(
f" [{model_code}] {scenario.id:<25} "
f"lat={result['latency_ms']}ms "
f"tokens={total_tokens} "
f"cost=${cost_usd:.6f} "
f"tool={'OK' if has_tool_success else 'FAIL'} "
f"answer={answer[:60]}"
)
scores.append(
ScenarioScore(
scenario_id=scenario.id,
model_code=model_code,
latency_ms=result["latency_ms"],
input_tokens=input_tokens,
output_tokens=output_tokens,
cost_usd=cost_usd,
tool_called=tool_called,
tool_succeeded=tool_succeeded,
answer_quality=answer_quality,
details=details,
raw_answer=answer[:500],
run_finished=result["run_finished"],
)
)
return ModelScorecard(model_code=model_code, scenario_scores=scores)
@pytest.fixture(autouse=True)
def _check_env():
if os.getenv("QUALITY_TEST") != "1":
pytest.skip("set QUALITY_TEST=1 to run quality tests")
@pytest.fixture(autouse=True)
def _require_test_user_id():
_get_test_user_id()
@pytest.mark.asyncio
@pytest.mark.quality
@pytest.mark.live
async def test_model_ab_comparison():
user_id = _get_test_user_id()
original_rows = _save_original_models()
scorecards: list[ModelScorecard] = []
try:
for model_code in CANDIDATE_MODELS:
_switch_model(model_code)
card = await _run_model_scenarios(model_code, user_id)
scorecards.append(card)
print(card.summary_table())
finally:
_restore_models(original_rows)
print("\n" + "=" * 60)
print("COMPARISON")
print("=" * 60)
for card in scorecards:
print(
f" {card.model_code:<20} "
f"overall={card.avg_overall:.2f} "
f"latency={card.avg_latency_ms:.0f}ms "
f"cost=${card.avg_cost_usd:.6f} "
f"tool_success={card.tool_success_rate:.0%}"
)
if len(scorecards) == 2:
a, b = scorecards
winner = a.model_code if a.avg_overall >= b.avg_overall else b.model_code
print(f"\n Winner: {winner} (by overall score)")