feat(agent): redesign project_cli with module/method/input protocol
- Replace command/subcommand/args with module/method/input envelope - Calendar handler uses discriminated union (mode) for read operations - Strict Pydantic models with extra='forbid' for all calendar methods - Worker max_iters=7, router prompt simplified (removed project_cli_defaults) - Skill index cards + per-action files for progressive disclosure - Frontend/AG-UI aligned to module/method dispatch - Protocol docs updated to module/method/input contract WIP: action cards need envelope fix, 2 tests need update, memory handler needs Pydantic models.
This commit is contained in:
@@ -0,0 +1,440 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from uuid import uuid4
|
||||
|
||||
import httpx
|
||||
import jwt
|
||||
import pytest
|
||||
|
||||
from backend.tests.quality.evaluators import ModelScorecard, ScoreDetail, ScenarioScore
|
||||
from backend.tests.quality.scenarios import ALL_SCENARIOS
|
||||
|
||||
CANDIDATE_MODELS = ["qwen3.5-flash", "deepseek-chat"]
|
||||
|
||||
MODEL_LLM_IDS = {
|
||||
"qwen3.5-flash": "c625bce4-970e-4a76-bebe-cb8840fed854",
|
||||
"deepseek-chat": "12bc1963-4b67-404b-b952-5948bea0f690",
|
||||
}
|
||||
|
||||
BASE_URL = os.getenv("AGENT_LIVE_BASE_URL", "http://localhost:5775")
|
||||
|
||||
|
||||
def _load_env() -> None:
|
||||
from pathlib import Path
|
||||
|
||||
env_path = Path(__file__).resolve().parents[3] / ".env"
|
||||
if env_path.exists():
|
||||
for line in env_path.read_text().splitlines():
|
||||
line = line.strip()
|
||||
if not line or line.startswith("#") or "=" not in line:
|
||||
continue
|
||||
key, _, value = line.partition("=")
|
||||
key = key.strip()
|
||||
value = value.strip().strip('"').strip("'")
|
||||
if key and key not in os.environ:
|
||||
os.environ[key] = value
|
||||
|
||||
|
||||
_load_env()
|
||||
|
||||
|
||||
def _get_jwt_secret() -> str:
|
||||
secret = (
|
||||
os.getenv("SOCIAL_SUPABASE__JWT_SECRET")
|
||||
or os.getenv("SUPABASE_JWT_SECRET")
|
||||
or os.getenv("JWT_SECRET")
|
||||
)
|
||||
if not secret:
|
||||
raise RuntimeError("JWT_SECRET not found in environment")
|
||||
return secret
|
||||
|
||||
|
||||
def _get_supabase_url() -> str:
|
||||
return (
|
||||
os.getenv("SOCIAL_SUPABASE__PUBLIC_URL")
|
||||
or os.getenv("SOCIAL_SUPABASE__URL")
|
||||
or os.getenv("SUPABASE_URL")
|
||||
or "http://localhost:54321"
|
||||
)
|
||||
|
||||
|
||||
def _get_supabase_key() -> str:
|
||||
from core.config.settings import config
|
||||
|
||||
key = os.getenv("SOCIAL_SUPABASE__SERVICE_ROLE_KEY", "")
|
||||
if key:
|
||||
return key
|
||||
return config.supabase.service_role_key
|
||||
|
||||
|
||||
def _get_test_user_id() -> str:
|
||||
user_id = os.getenv("TEST_USER_ID")
|
||||
if user_id:
|
||||
return user_id
|
||||
raise RuntimeError("TEST_USER_ID not set")
|
||||
|
||||
|
||||
def _create_jwt(user_id: str) -> str:
|
||||
now = int(time.time())
|
||||
payload = {
|
||||
"sub": user_id,
|
||||
"role": "authenticated",
|
||||
"aud": "authenticated",
|
||||
"iss": _get_supabase_url(),
|
||||
"iat": now,
|
||||
"exp": now + 3600,
|
||||
}
|
||||
return jwt.encode(payload, _get_jwt_secret(), algorithm="HS256")
|
||||
|
||||
|
||||
async def _run_via_http(
|
||||
*,
|
||||
user_message: str,
|
||||
token: str,
|
||||
timeout: float = 120.0,
|
||||
) -> dict:
|
||||
thread_id = str(uuid4())
|
||||
run_id = f"q-{uuid4().hex[:12]}"
|
||||
|
||||
async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
|
||||
run_resp = await client.post(
|
||||
f"{BASE_URL}/api/v1/agent/runs",
|
||||
headers=headers,
|
||||
json={
|
||||
"threadId": thread_id,
|
||||
"runId": run_id,
|
||||
"state": {},
|
||||
"messages": [
|
||||
{"id": "u1", "role": "user", "content": user_message}
|
||||
],
|
||||
"tools": [],
|
||||
"context": [],
|
||||
"forwardedProps": {"runtime_mode": "chat"},
|
||||
},
|
||||
)
|
||||
run_data = run_resp.json()
|
||||
eff_thread = str(run_data.get("threadId", thread_id))
|
||||
eff_run = run_data.get("runId", run_id)
|
||||
events_url = (
|
||||
f"{BASE_URL}/api/v1/agent/runs/{eff_thread}/events"
|
||||
f"?runId={eff_run}"
|
||||
)
|
||||
|
||||
t_start = time.monotonic()
|
||||
|
||||
tool_results: list[dict] = []
|
||||
all_events: list[dict] = []
|
||||
final_answer = ""
|
||||
run_finished = False
|
||||
token_usage: dict = {}
|
||||
|
||||
async with client.stream(
|
||||
"GET", events_url, headers=headers, timeout=timeout
|
||||
) as sse:
|
||||
buffer = ""
|
||||
async for line in sse.aiter_lines():
|
||||
if line.startswith("data:"):
|
||||
data_str = line.split(":", 1)[1].strip()
|
||||
if data_str:
|
||||
buffer = data_str
|
||||
elif line == "" and buffer:
|
||||
try:
|
||||
ev = json.loads(buffer)
|
||||
all_events.append(ev)
|
||||
etype = ev.get("type")
|
||||
|
||||
if etype == "TOOL_CALL_RESULT":
|
||||
tool_results.append(ev)
|
||||
elif etype == "TEXT_MESSAGE_END":
|
||||
final_answer = ev.get("answer", "") or ev.get("text", "")
|
||||
token_usage = {
|
||||
"totalTokens": ev.get("totalTokens", 0),
|
||||
"inputTokens": ev.get("inputTokens", 0),
|
||||
"outputTokens": ev.get("outputTokens", 0),
|
||||
"promptCacheMissTokens": ev.get(
|
||||
"promptCacheMissTokens", 0
|
||||
),
|
||||
"promptCacheHitTokens": ev.get(
|
||||
"promptCacheHitTokens", 0
|
||||
),
|
||||
}
|
||||
elif etype in {"RUN_FINISHED", "RUN_ERROR"}:
|
||||
run_finished = True
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
buffer = ""
|
||||
|
||||
t_end = time.monotonic()
|
||||
|
||||
tool_names = [
|
||||
tr.get("tool_name", "") or tr.get("toolName", "")
|
||||
for tr in tool_results
|
||||
]
|
||||
successful_tool_names = [
|
||||
tr.get("tool_name", "") or tr.get("toolName", "")
|
||||
for tr in tool_results
|
||||
if tr.get("status") in ("success", "partial")
|
||||
]
|
||||
|
||||
return {
|
||||
"final_answer": final_answer,
|
||||
"tool_results": tool_results,
|
||||
"tool_names": tool_names,
|
||||
"successful_tool_names": successful_tool_names,
|
||||
"run_finished": run_finished,
|
||||
"latency_ms": round((t_end - t_start) * 1000),
|
||||
"token_usage": token_usage,
|
||||
}
|
||||
|
||||
|
||||
def _switch_model(model_code: str) -> None:
|
||||
from supabase import create_client
|
||||
|
||||
sb = create_client(_get_supabase_url(), _get_supabase_key())
|
||||
llm_id = MODEL_LLM_IDS[model_code]
|
||||
for agent_type in ("router", "worker"):
|
||||
(
|
||||
sb.table("system_agents")
|
||||
.update({"llm_id": llm_id})
|
||||
.eq("agent_type", agent_type)
|
||||
.execute()
|
||||
)
|
||||
|
||||
|
||||
def _save_original_models() -> list[dict]:
|
||||
from supabase import create_client
|
||||
|
||||
sb = create_client(_get_supabase_url(), _get_supabase_key())
|
||||
return (
|
||||
sb.table("system_agents")
|
||||
.select("agent_type, llm_id")
|
||||
.execute()
|
||||
.data
|
||||
)
|
||||
|
||||
|
||||
def _restore_models(original_rows: list[dict]) -> None:
|
||||
from supabase import create_client
|
||||
|
||||
sb = create_client(_get_supabase_url(), _get_supabase_key())
|
||||
for row in original_rows:
|
||||
(
|
||||
sb.table("system_agents")
|
||||
.update({"llm_id": row["llm_id"]})
|
||||
.eq("agent_type", row["agent_type"])
|
||||
.execute()
|
||||
)
|
||||
|
||||
|
||||
def _evaluate_answer_quality(
|
||||
*,
|
||||
answer: str,
|
||||
run_finished: bool,
|
||||
expect_tool_use: bool,
|
||||
has_tool_success: bool,
|
||||
tool_names: list[str],
|
||||
) -> float:
|
||||
if not run_finished:
|
||||
return 0.0
|
||||
if not answer or not answer.strip():
|
||||
return 0.0
|
||||
|
||||
score = 0.6
|
||||
|
||||
if expect_tool_use:
|
||||
if has_tool_success:
|
||||
score += 0.2
|
||||
elif tool_names:
|
||||
score += 0.1
|
||||
else:
|
||||
score -= 0.3
|
||||
else:
|
||||
if not tool_names:
|
||||
score += 0.2
|
||||
else:
|
||||
score -= 0.1
|
||||
|
||||
if len(answer) > 10:
|
||||
score += 0.1
|
||||
|
||||
if "无法" in answer or "失败" in answer or "错误" in answer:
|
||||
if expect_tool_use:
|
||||
score -= 0.1
|
||||
|
||||
return max(0.0, min(1.0, score))
|
||||
|
||||
|
||||
def _evaluate_criteria(
|
||||
*,
|
||||
answer: str,
|
||||
run_finished: bool,
|
||||
tool_names: list[str],
|
||||
has_tool_success: bool,
|
||||
tool_results: list[dict],
|
||||
scenario: object,
|
||||
) -> list[ScoreDetail]:
|
||||
details: list[ScoreDetail] = []
|
||||
for criterion in getattr(scenario, "quality_criteria", []):
|
||||
passed = False
|
||||
note = ""
|
||||
|
||||
if "调用" in criterion or "project_cli" in criterion:
|
||||
passed = any("project_cli" in tn for tn in tool_names)
|
||||
note = f"tools: {tool_names}" if not passed else ""
|
||||
elif "mode" in criterion and "day" in criterion:
|
||||
for tr in tool_results:
|
||||
args = tr.get("tool_call_args", {}) or tr.get("toolCallArgs", {})
|
||||
inp = args.get("input", {})
|
||||
if isinstance(inp, dict) and inp.get("mode") == "day":
|
||||
passed = True
|
||||
break
|
||||
elif "具体" in criterion or "时间戳" in criterion:
|
||||
passed = has_tool_success
|
||||
elif "基于工具" in criterion or "返回" in criterion:
|
||||
passed = has_tool_success
|
||||
elif "无日程" in criterion:
|
||||
passed = "无" in answer or "没有" in answer
|
||||
elif "简短" in criterion or "简洁" in criterion:
|
||||
passed = 0 < len(answer) < 200
|
||||
elif "自我介绍" in criterion:
|
||||
passed = "Linksy" in answer or "助手" in answer
|
||||
elif "礼貌" in criterion:
|
||||
passed = len(answer) > 0
|
||||
else:
|
||||
passed = run_finished and len(answer) > 0
|
||||
|
||||
details.append(ScoreDetail(criterion=criterion, passed=passed, note=note))
|
||||
return details
|
||||
|
||||
|
||||
async def _run_model_scenarios(model_code: str, user_id: str) -> ModelScorecard:
|
||||
from services.llm_pricing.service import LlmPricingService
|
||||
|
||||
pricing = LlmPricingService()
|
||||
token = _create_jwt(user_id)
|
||||
scores: list[ScenarioScore] = []
|
||||
|
||||
for scenario in ALL_SCENARIOS:
|
||||
result = await _run_via_http(
|
||||
user_message=scenario.prompt,
|
||||
token=token,
|
||||
)
|
||||
|
||||
answer = result["final_answer"]
|
||||
tool_names = result["tool_names"]
|
||||
has_tool_success = len(result["successful_tool_names"]) > 0
|
||||
tu = result["token_usage"]
|
||||
|
||||
total_tokens = tu.get("totalTokens", 0)
|
||||
input_tokens = tu.get("inputTokens", 0) or tu.get("promptCacheMissTokens", 0)
|
||||
output_tokens = tu.get("outputTokens", 0) or max(total_tokens - input_tokens, 0)
|
||||
|
||||
try:
|
||||
cost_usd = pricing.calculate_cost(
|
||||
model=model_code,
|
||||
prompt_tokens=input_tokens,
|
||||
completion_tokens=output_tokens,
|
||||
cached_prompt_tokens=tu.get("promptCacheHitTokens", 0),
|
||||
)
|
||||
except ValueError:
|
||||
cost_usd = 0.0
|
||||
cost_usd = round(cost_usd, 8)
|
||||
|
||||
tool_called = any("project_cli" in tn for tn in tool_names)
|
||||
tool_succeeded = has_tool_success if scenario.expect_tool_use else True
|
||||
|
||||
answer_quality = _evaluate_answer_quality(
|
||||
answer=answer,
|
||||
run_finished=result["run_finished"],
|
||||
expect_tool_use=scenario.expect_tool_use,
|
||||
has_tool_success=has_tool_success,
|
||||
tool_names=tool_names,
|
||||
)
|
||||
|
||||
details = _evaluate_criteria(
|
||||
answer=answer,
|
||||
run_finished=result["run_finished"],
|
||||
tool_names=tool_names,
|
||||
has_tool_success=has_tool_success,
|
||||
tool_results=result["tool_results"],
|
||||
scenario=scenario,
|
||||
)
|
||||
|
||||
print(
|
||||
f" [{model_code}] {scenario.id:<25} "
|
||||
f"lat={result['latency_ms']}ms "
|
||||
f"tokens={total_tokens} "
|
||||
f"cost=${cost_usd:.6f} "
|
||||
f"tool={'OK' if has_tool_success else 'FAIL'} "
|
||||
f"answer={answer[:60]}"
|
||||
)
|
||||
|
||||
scores.append(
|
||||
ScenarioScore(
|
||||
scenario_id=scenario.id,
|
||||
model_code=model_code,
|
||||
latency_ms=result["latency_ms"],
|
||||
input_tokens=input_tokens,
|
||||
output_tokens=output_tokens,
|
||||
cost_usd=cost_usd,
|
||||
tool_called=tool_called,
|
||||
tool_succeeded=tool_succeeded,
|
||||
answer_quality=answer_quality,
|
||||
details=details,
|
||||
raw_answer=answer[:500],
|
||||
run_finished=result["run_finished"],
|
||||
)
|
||||
)
|
||||
|
||||
return ModelScorecard(model_code=model_code, scenario_scores=scores)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _check_env():
|
||||
if os.getenv("QUALITY_TEST") != "1":
|
||||
pytest.skip("set QUALITY_TEST=1 to run quality tests")
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _require_test_user_id():
|
||||
_get_test_user_id()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.quality
|
||||
@pytest.mark.live
|
||||
async def test_model_ab_comparison():
|
||||
user_id = _get_test_user_id()
|
||||
original_rows = _save_original_models()
|
||||
|
||||
scorecards: list[ModelScorecard] = []
|
||||
try:
|
||||
for model_code in CANDIDATE_MODELS:
|
||||
_switch_model(model_code)
|
||||
card = await _run_model_scenarios(model_code, user_id)
|
||||
scorecards.append(card)
|
||||
print(card.summary_table())
|
||||
finally:
|
||||
_restore_models(original_rows)
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("COMPARISON")
|
||||
print("=" * 60)
|
||||
for card in scorecards:
|
||||
print(
|
||||
f" {card.model_code:<20} "
|
||||
f"overall={card.avg_overall:.2f} "
|
||||
f"latency={card.avg_latency_ms:.0f}ms "
|
||||
f"cost=${card.avg_cost_usd:.6f} "
|
||||
f"tool_success={card.tool_success_rate:.0%}"
|
||||
)
|
||||
|
||||
if len(scorecards) == 2:
|
||||
a, b = scorecards
|
||||
winner = a.model_code if a.avg_overall >= b.avg_overall else b.model_code
|
||||
print(f"\n Winner: {winner} (by overall score)")
|
||||
Reference in New Issue
Block a user