feat(agent): redesign project_cli with module/method/input protocol

- Replace command/subcommand/args with module/method/input envelope - Calendar handler uses discriminated union (mode) for read operations - Strict Pydantic models with extra='forbid' for all calendar methods - Worker max_iters=7, router prompt simplified (removed project_cli_defaults) - Skill index cards + per-action files for progressive disclosure - Frontend/AG-UI aligned to module/method dispatch - Protocol docs updated to module/method/input contract WIP: action cards need envelope fix, 2 tests need update, memory handler needs Pydantic models.
2026-04-24 13:24:13 +08:00
parent ab526af2c4
commit d060962a5f
62 changed files with 4802 additions and 805 deletions
@@ -0,0 +1,196 @@
+from __future__ import annotations
+
+import os
+import time
+from pathlib import Path
+from uuid import uuid4
+
+import httpx
+import jwt
+
+
+def _load_env() -> None:
+    env_path = Path(__file__).resolve().parents[3] / ".env"
+    if env_path.exists():
+        for line in env_path.read_text().splitlines():
+            line = line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            key, _, value = line.partition("=")
+            key = key.strip()
+            value = value.strip().strip('"').strip("'")
+            if key and key not in os.environ:
+                os.environ[key] = value
+
+
+_load_env()
+
+BASE_URL = os.getenv("AGENT_LIVE_BASE_URL", "http://localhost:5775")
+
+
+def get_jwt_secret() -> str:
+    secret = (
+        os.getenv("SOCIAL_SUPABASE__JWT_SECRET")
+        or os.getenv("SUPABASE_JWT_SECRET")
+        or os.getenv("JWT_SECRET")
+    )
+    if not secret:
+        raise RuntimeError("JWT_SECRET not found in environment")
+    return secret
+
+
+def get_supabase_url() -> str:
+    return (
+        os.getenv("SOCIAL_SUPABASE__URL")
+        or os.getenv("SUPABASE_URL")
+        or "http://localhost:54321"
+    )
+
+
+def get_test_user_id() -> str:
+    user_id = os.getenv("TEST_USER_ID")
+    if user_id:
+        return user_id
+    raise RuntimeError("TEST_USER_ID not set")
+
+
+def create_test_jwt(user_id: str) -> str:
+    now = int(time.time())
+    payload = {
+        "sub": user_id,
+        "role": "authenticated",
+        "aud": "authenticated",
+        "iss": get_supabase_url(),
+        "iat": now,
+        "exp": now + 3600,
+    }
+    return jwt.encode(payload, get_jwt_secret(), algorithm="HS256")
+
+
+async def run_agent_and_collect(
+    *,
+    user_message: str,
+    client: httpx.AsyncClient,
+    headers: dict,
+    run_id: str | None = None,
+    thread_id: str | None = None,
+    timeout: float = 120.0,
+) -> AgentRunResult:
+    if thread_id is None:
+        thread_id = str(uuid4())
+    if run_id is None:
+        run_id = f"quality-{thread_id[:8]}"
+
+    t_start = time.monotonic()
+
+    run_resp = await client.post(
+        f"{BASE_URL}/api/v1/agent/runs",
+        headers=headers,
+        json={
+            "threadId": thread_id,
+            "runId": run_id,
+            "state": {},
+            "messages": [
+                {"id": "u1", "role": "user", "content": user_message}
+            ],
+            "tools": [],
+            "context": [],
+            "forwardedProps": {"runtime_mode": "chat"},
+        },
+    )
+
+    run_data = run_resp.json()
+    effective_thread_id = str(run_data.get("threadId", thread_id))
+    effective_run_id = run_data.get("runId", run_id)
+
+    events_url = (
+        f"{BASE_URL}/api/v1/agent/runs/{effective_thread_id}/events"
+        f"?runId={effective_run_id}"
+    )
+
+    import json
+
+    tool_results: list[dict] = []
+    all_events: list[dict] = []
+    run_finished = False
+    final_answer = ""
+
+    async with client.stream(
+        "GET", events_url, headers=headers, timeout=timeout
+    ) as sse_resp:
+        buffer = ""
+        async for line in sse_resp.aiter_lines():
+            if line.startswith("data:"):
+                data_str = line.split(":", 1)[1].strip()
+                if data_str:
+                    buffer = data_str
+            elif line == "" and buffer:
+                try:
+                    event_data = json.loads(buffer)
+                    event_type = event_data.get("type")
+                    all_events.append(event_data)
+
+                    if event_type == "TOOL_CALL_RESULT":
+                        tool_results.append(event_data)
+                    elif event_type == "TEXT_MESSAGE_END":
+                        final_answer = event_data.get("answer", "") or event_data.get("text", "")
+                    elif event_type in {"RUN_FINISHED", "RUN_ERROR"}:
+                        run_finished = True
+                except json.JSONDecodeError:
+                    pass
+                buffer = ""
+
+    t_end = time.monotonic()
+
+    return AgentRunResult(
+        thread_id=effective_thread_id,
+        run_id=effective_run_id,
+        user_message=user_message,
+        final_answer=final_answer,
+        tool_results=tool_results,
+        all_events=all_events,
+        run_finished=run_finished,
+        latency_ms=round((t_end - t_start) * 1000),
+    )
+
+
+class AgentRunResult:
+    def __init__(
+        self,
+        *,
+        thread_id: str,
+        run_id: str,
+        user_message: str,
+        final_answer: str,
+        tool_results: list[dict],
+        all_events: list[dict],
+        run_finished: bool,
+        latency_ms: int,
+    ) -> None:
+        self.thread_id = thread_id
+        self.run_id = run_id
+        self.user_message = user_message
+        self.final_answer = final_answer
+        self.tool_results = tool_results
+        self.all_events = all_events
+        self.run_finished = run_finished
+        self.latency_ms = latency_ms
+
+    @property
+    def tool_names_called(self) -> list[str]:
+        return [
+            tr.get("tool_name", "") or tr.get("toolName", "")
+            for tr in self.tool_results
+        ]
+
+    @property
+    def successful_tool_names(self) -> list[str]:
+        return [
+            tr.get("tool_name", "") or tr.get("toolName", "")
+            for tr in self.tool_results
+            if tr.get("status") in ("success", "partial")
+        ]
+
+    @property
+    def has_tool_success(self) -> bool:
+        return len(self.successful_tool_names) > 0
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+from pydantic import BaseModel
+
+
+class ScoreDetail(BaseModel):
+    criterion: str
+    passed: bool
+    note: str = ""
+
+
+class ScenarioScore(BaseModel):
+    scenario_id: str
+    model_code: str
+    latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float = 0.0
+    tool_called: bool
+    tool_succeeded: bool
+    answer_quality: float
+    details: list[ScoreDetail]
+    raw_answer: str = ""
+    run_finished: bool = True
+
+    @property
+    def overall_score(self) -> float:
+        weights = {
+            "tool_correctness": 0.3,
+            "answer_quality": 0.5,
+            "latency": 0.2,
+        }
+        tool_score = 1.0 if self.tool_succeeded else (0.5 if self.tool_called else 0.0)
+        latency_score = self._latency_score()
+        return (
+            weights["tool_correctness"] * tool_score
+            + weights["answer_quality"] * self.answer_quality
+            + weights["latency"] * latency_score
+        )
+
+    def _latency_score(self) -> float:
+        if self.latency_ms <= 5000:
+            return 1.0
+        if self.latency_ms <= 15000:
+            return 0.7
+        if self.latency_ms <= 30000:
+            return 0.4
+        return 0.1
+
+
+class ModelScorecard(BaseModel):
+    model_code: str
+    scenario_scores: list[ScenarioScore]
+
+    @property
+    def avg_overall(self) -> float:
+        if not self.scenario_scores:
+            return 0.0
+        return sum(s.overall_score for s in self.scenario_scores) / len(self.scenario_scores)
+
+    @property
+    def avg_latency_ms(self) -> float:
+        if not self.scenario_scores:
+            return 0.0
+        return sum(s.latency_ms for s in self.scenario_scores) / len(self.scenario_scores)
+
+    @property
+    def avg_cost_usd(self) -> float:
+        if not self.scenario_scores:
+            return 0.0
+        return sum(s.cost_usd for s in self.scenario_scores) / len(self.scenario_scores)
+
+    @property
+    def tool_success_rate(self) -> float:
+        if not self.scenario_scores:
+            return 0.0
+        return sum(1 for s in self.scenario_scores if s.tool_succeeded) / len(self.scenario_scores)
+
+    def summary_table(self) -> str:
+        lines = [
+            f"\n{'='*60}",
+            f"Model Scorecard: {self.model_code}",
+            f"{'='*60}",
+            f"  Avg Overall Score : {self.avg_overall:.2f}",
+            f"  Avg Latency       : {self.avg_latency_ms:.0f}ms",
+            f"  Avg Cost          : ${self.avg_cost_usd:.6f}",
+            f"  Tool Success Rate : {self.tool_success_rate:.0%}",
+            f"{'-'*60}",
+        ]
+        for s in self.scenario_scores:
+            status = "PASS" if s.tool_succeeded else "FAIL"
+            lines.append(
+                f"  [{status}] {s.scenario_id:<25} "
+                f"score={s.overall_score:.2f} "
+                f"lat={s.latency_ms}ms "
+                f"cost=${s.cost_usd:.6f}"
+            )
+        lines.append(f"{'='*60}")
+        return "\n".join(lines)
@@ -0,0 +1,82 @@
+from __future__ import annotations
+
+from pydantic import BaseModel
+
+
+class EvalScenario(BaseModel):
+    id: str
+    prompt: str
+    category: str
+    expect_tool_use: bool
+    expect_tool_success: bool
+    quality_criteria: list[str]
+
+
+CALENDAR_SCENARIOS: list[EvalScenario] = [
+    EvalScenario(
+        id="calendar-read-today",
+        prompt="请查询我今天的日程安排",
+        category="calendar",
+        expect_tool_use=True,
+        expect_tool_success=True,
+        quality_criteria=[
+            "应调用 project_cli 的 calendar.read 方法",
+            "input 应包含 mode=day 和具体日期",
+            "回答应基于工具返回的实际数据",
+            "如果无日程，应明确告知无日程",
+        ],
+    ),
+    EvalScenario(
+        id="calendar-create-event",
+        prompt="帮我创建一个明天下午3点两小时的会议，标题是项目周会",
+        category="calendar",
+        expect_tool_use=True,
+        expect_tool_success=True,
+        quality_criteria=[
+            "应调用 project_cli 的 calendar.create 方法",
+            "input 应包含 title、start_at、timezone",
+            "start_at 应为具体的时间戳而非自然语言",
+            "应返回创建结果（包含 event_id）",
+        ],
+    ),
+    EvalScenario(
+        id="calendar-read-range",
+        prompt="这周一到周五我有哪些日程？",
+        category="calendar",
+        expect_tool_use=True,
+        expect_tool_success=True,
+        quality_criteria=[
+            "应调用 project_cli 的 calendar.read 方法",
+            "input 应使用 mode=range 或多次 mode=day",
+            "应提供完整时间范围",
+        ],
+    ),
+]
+
+GENERAL_SCENARIOS: list[EvalScenario] = [
+    EvalScenario(
+        id="general-greeting",
+        prompt="你好，你是谁？",
+        category="general",
+        expect_tool_use=False,
+        expect_tool_success=False,
+        quality_criteria=[
+            "应简短自我介绍",
+            "不应调用任何工具",
+            "回答简洁不啰嗦",
+        ],
+    ),
+    EvalScenario(
+        id="general-farewell",
+        prompt="好的谢谢，再见",
+        category="general",
+        expect_tool_use=False,
+        expect_tool_success=False,
+        quality_criteria=[
+            "应礼貌告别",
+            "不应调用任何工具",
+        ],
+    ),
+]
+
+ALL_SCENARIOS = CALENDAR_SCENARIOS + GENERAL_SCENARIOS
@@ -0,0 +1,440 @@
+from __future__ import annotations
+
+import json
+import os
+import time
+from uuid import uuid4
+
+import httpx
+import jwt
+import pytest
+
+from backend.tests.quality.evaluators import ModelScorecard, ScoreDetail, ScenarioScore
+from backend.tests.quality.scenarios import ALL_SCENARIOS
+
+CANDIDATE_MODELS = ["qwen3.5-flash", "deepseek-chat"]
+
+MODEL_LLM_IDS = {
+    "qwen3.5-flash": "c625bce4-970e-4a76-bebe-cb8840fed854",
+    "deepseek-chat": "12bc1963-4b67-404b-b952-5948bea0f690",
+}
+
+BASE_URL = os.getenv("AGENT_LIVE_BASE_URL", "http://localhost:5775")
+
+
+def _load_env() -> None:
+    from pathlib import Path
+
+    env_path = Path(__file__).resolve().parents[3] / ".env"
+    if env_path.exists():
+        for line in env_path.read_text().splitlines():
+            line = line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            key, _, value = line.partition("=")
+            key = key.strip()
+            value = value.strip().strip('"').strip("'")
+            if key and key not in os.environ:
+                os.environ[key] = value
+
+
+_load_env()
+
+
+def _get_jwt_secret() -> str:
+    secret = (
+        os.getenv("SOCIAL_SUPABASE__JWT_SECRET")
+        or os.getenv("SUPABASE_JWT_SECRET")
+        or os.getenv("JWT_SECRET")
+    )
+    if not secret:
+        raise RuntimeError("JWT_SECRET not found in environment")
+    return secret
+
+
+def _get_supabase_url() -> str:
+    return (
+        os.getenv("SOCIAL_SUPABASE__PUBLIC_URL")
+        or os.getenv("SOCIAL_SUPABASE__URL")
+        or os.getenv("SUPABASE_URL")
+        or "http://localhost:54321"
+    )
+
+
+def _get_supabase_key() -> str:
+    from core.config.settings import config
+
+    key = os.getenv("SOCIAL_SUPABASE__SERVICE_ROLE_KEY", "")
+    if key:
+        return key
+    return config.supabase.service_role_key
+
+
+def _get_test_user_id() -> str:
+    user_id = os.getenv("TEST_USER_ID")
+    if user_id:
+        return user_id
+    raise RuntimeError("TEST_USER_ID not set")
+
+
+def _create_jwt(user_id: str) -> str:
+    now = int(time.time())
+    payload = {
+        "sub": user_id,
+        "role": "authenticated",
+        "aud": "authenticated",
+        "iss": _get_supabase_url(),
+        "iat": now,
+        "exp": now + 3600,
+    }
+    return jwt.encode(payload, _get_jwt_secret(), algorithm="HS256")
+
+
+async def _run_via_http(
+    *,
+    user_message: str,
+    token: str,
+    timeout: float = 120.0,
+) -> dict:
+    thread_id = str(uuid4())
+    run_id = f"q-{uuid4().hex[:12]}"
+
+    async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
+        headers = {"Authorization": f"Bearer {token}"}
+
+        run_resp = await client.post(
+            f"{BASE_URL}/api/v1/agent/runs",
+            headers=headers,
+            json={
+                "threadId": thread_id,
+                "runId": run_id,
+                "state": {},
+                "messages": [
+                    {"id": "u1", "role": "user", "content": user_message}
+                ],
+                "tools": [],
+                "context": [],
+                "forwardedProps": {"runtime_mode": "chat"},
+            },
+        )
+        run_data = run_resp.json()
+        eff_thread = str(run_data.get("threadId", thread_id))
+        eff_run = run_data.get("runId", run_id)
+        events_url = (
+            f"{BASE_URL}/api/v1/agent/runs/{eff_thread}/events"
+            f"?runId={eff_run}"
+        )
+
+        t_start = time.monotonic()
+
+        tool_results: list[dict] = []
+        all_events: list[dict] = []
+        final_answer = ""
+        run_finished = False
+        token_usage: dict = {}
+
+        async with client.stream(
+            "GET", events_url, headers=headers, timeout=timeout
+        ) as sse:
+            buffer = ""
+            async for line in sse.aiter_lines():
+                if line.startswith("data:"):
+                    data_str = line.split(":", 1)[1].strip()
+                    if data_str:
+                        buffer = data_str
+                elif line == "" and buffer:
+                    try:
+                        ev = json.loads(buffer)
+                        all_events.append(ev)
+                        etype = ev.get("type")
+
+                        if etype == "TOOL_CALL_RESULT":
+                            tool_results.append(ev)
+                        elif etype == "TEXT_MESSAGE_END":
+                            final_answer = ev.get("answer", "") or ev.get("text", "")
+                            token_usage = {
+                                "totalTokens": ev.get("totalTokens", 0),
+                                "inputTokens": ev.get("inputTokens", 0),
+                                "outputTokens": ev.get("outputTokens", 0),
+                                "promptCacheMissTokens": ev.get(
+                                    "promptCacheMissTokens", 0
+                                ),
+                                "promptCacheHitTokens": ev.get(
+                                    "promptCacheHitTokens", 0
+                                ),
+                            }
+                        elif etype in {"RUN_FINISHED", "RUN_ERROR"}:
+                            run_finished = True
+                    except json.JSONDecodeError:
+                        pass
+                    buffer = ""
+
+        t_end = time.monotonic()
+
+        tool_names = [
+            tr.get("tool_name", "") or tr.get("toolName", "")
+            for tr in tool_results
+        ]
+        successful_tool_names = [
+            tr.get("tool_name", "") or tr.get("toolName", "")
+            for tr in tool_results
+            if tr.get("status") in ("success", "partial")
+        ]
+
+        return {
+            "final_answer": final_answer,
+            "tool_results": tool_results,
+            "tool_names": tool_names,
+            "successful_tool_names": successful_tool_names,
+            "run_finished": run_finished,
+            "latency_ms": round((t_end - t_start) * 1000),
+            "token_usage": token_usage,
+        }
+
+
+def _switch_model(model_code: str) -> None:
+    from supabase import create_client
+
+    sb = create_client(_get_supabase_url(), _get_supabase_key())
+    llm_id = MODEL_LLM_IDS[model_code]
+    for agent_type in ("router", "worker"):
+        (
+            sb.table("system_agents")
+            .update({"llm_id": llm_id})
+            .eq("agent_type", agent_type)
+            .execute()
+        )
+
+
+def _save_original_models() -> list[dict]:
+    from supabase import create_client
+
+    sb = create_client(_get_supabase_url(), _get_supabase_key())
+    return (
+        sb.table("system_agents")
+        .select("agent_type, llm_id")
+        .execute()
+        .data
+    )
+
+
+def _restore_models(original_rows: list[dict]) -> None:
+    from supabase import create_client
+
+    sb = create_client(_get_supabase_url(), _get_supabase_key())
+    for row in original_rows:
+        (
+            sb.table("system_agents")
+            .update({"llm_id": row["llm_id"]})
+            .eq("agent_type", row["agent_type"])
+            .execute()
+        )
+
+
+def _evaluate_answer_quality(
+    *,
+    answer: str,
+    run_finished: bool,
+    expect_tool_use: bool,
+    has_tool_success: bool,
+    tool_names: list[str],
+) -> float:
+    if not run_finished:
+        return 0.0
+    if not answer or not answer.strip():
+        return 0.0
+
+    score = 0.6
+
+    if expect_tool_use:
+        if has_tool_success:
+            score += 0.2
+        elif tool_names:
+            score += 0.1
+        else:
+            score -= 0.3
+    else:
+        if not tool_names:
+            score += 0.2
+        else:
+            score -= 0.1
+
+    if len(answer) > 10:
+        score += 0.1
+
+    if "无法" in answer or "失败" in answer or "错误" in answer:
+        if expect_tool_use:
+            score -= 0.1
+
+    return max(0.0, min(1.0, score))
+
+
+def _evaluate_criteria(
+    *,
+    answer: str,
+    run_finished: bool,
+    tool_names: list[str],
+    has_tool_success: bool,
+    tool_results: list[dict],
+    scenario: object,
+) -> list[ScoreDetail]:
+    details: list[ScoreDetail] = []
+    for criterion in getattr(scenario, "quality_criteria", []):
+        passed = False
+        note = ""
+
+        if "调用" in criterion or "project_cli" in criterion:
+            passed = any("project_cli" in tn for tn in tool_names)
+            note = f"tools: {tool_names}" if not passed else ""
+        elif "mode" in criterion and "day" in criterion:
+            for tr in tool_results:
+                args = tr.get("tool_call_args", {}) or tr.get("toolCallArgs", {})
+                inp = args.get("input", {})
+                if isinstance(inp, dict) and inp.get("mode") == "day":
+                    passed = True
+                    break
+        elif "具体" in criterion or "时间戳" in criterion:
+            passed = has_tool_success
+        elif "基于工具" in criterion or "返回" in criterion:
+            passed = has_tool_success
+        elif "无日程" in criterion:
+            passed = "无" in answer or "没有" in answer
+        elif "简短" in criterion or "简洁" in criterion:
+            passed = 0 < len(answer) < 200
+        elif "自我介绍" in criterion:
+            passed = "Linksy" in answer or "助手" in answer
+        elif "礼貌" in criterion:
+            passed = len(answer) > 0
+        else:
+            passed = run_finished and len(answer) > 0
+
+        details.append(ScoreDetail(criterion=criterion, passed=passed, note=note))
+    return details
+
+
+async def _run_model_scenarios(model_code: str, user_id: str) -> ModelScorecard:
+    from services.llm_pricing.service import LlmPricingService
+
+    pricing = LlmPricingService()
+    token = _create_jwt(user_id)
+    scores: list[ScenarioScore] = []
+
+    for scenario in ALL_SCENARIOS:
+        result = await _run_via_http(
+            user_message=scenario.prompt,
+            token=token,
+        )
+
+        answer = result["final_answer"]
+        tool_names = result["tool_names"]
+        has_tool_success = len(result["successful_tool_names"]) > 0
+        tu = result["token_usage"]
+
+        total_tokens = tu.get("totalTokens", 0)
+        input_tokens = tu.get("inputTokens", 0) or tu.get("promptCacheMissTokens", 0)
+        output_tokens = tu.get("outputTokens", 0) or max(total_tokens - input_tokens, 0)
+
+        try:
+            cost_usd = pricing.calculate_cost(
+                model=model_code,
+                prompt_tokens=input_tokens,
+                completion_tokens=output_tokens,
+                cached_prompt_tokens=tu.get("promptCacheHitTokens", 0),
+            )
+        except ValueError:
+            cost_usd = 0.0
+        cost_usd = round(cost_usd, 8)
+
+        tool_called = any("project_cli" in tn for tn in tool_names)
+        tool_succeeded = has_tool_success if scenario.expect_tool_use else True
+
+        answer_quality = _evaluate_answer_quality(
+            answer=answer,
+            run_finished=result["run_finished"],
+            expect_tool_use=scenario.expect_tool_use,
+            has_tool_success=has_tool_success,
+            tool_names=tool_names,
+        )
+
+        details = _evaluate_criteria(
+            answer=answer,
+            run_finished=result["run_finished"],
+            tool_names=tool_names,
+            has_tool_success=has_tool_success,
+            tool_results=result["tool_results"],
+            scenario=scenario,
+        )
+
+        print(
+            f"  [{model_code}] {scenario.id:<25} "
+            f"lat={result['latency_ms']}ms "
+            f"tokens={total_tokens} "
+            f"cost=${cost_usd:.6f} "
+            f"tool={'OK' if has_tool_success else 'FAIL'} "
+            f"answer={answer[:60]}"
+        )
+
+        scores.append(
+            ScenarioScore(
+                scenario_id=scenario.id,
+                model_code=model_code,
+                latency_ms=result["latency_ms"],
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
+                cost_usd=cost_usd,
+                tool_called=tool_called,
+                tool_succeeded=tool_succeeded,
+                answer_quality=answer_quality,
+                details=details,
+                raw_answer=answer[:500],
+                run_finished=result["run_finished"],
+            )
+        )
+
+    return ModelScorecard(model_code=model_code, scenario_scores=scores)
+
+
+@pytest.fixture(autouse=True)
+def _check_env():
+    if os.getenv("QUALITY_TEST") != "1":
+        pytest.skip("set QUALITY_TEST=1 to run quality tests")
+
+
+@pytest.fixture(autouse=True)
+def _require_test_user_id():
+    _get_test_user_id()
+
+
+@pytest.mark.asyncio
+@pytest.mark.quality
+@pytest.mark.live
+async def test_model_ab_comparison():
+    user_id = _get_test_user_id()
+    original_rows = _save_original_models()
+
+    scorecards: list[ModelScorecard] = []
+    try:
+        for model_code in CANDIDATE_MODELS:
+            _switch_model(model_code)
+            card = await _run_model_scenarios(model_code, user_id)
+            scorecards.append(card)
+            print(card.summary_table())
+    finally:
+        _restore_models(original_rows)
+
+    print("\n" + "=" * 60)
+    print("COMPARISON")
+    print("=" * 60)
+    for card in scorecards:
+        print(
+            f"  {card.model_code:<20} "
+            f"overall={card.avg_overall:.2f}  "
+            f"latency={card.avg_latency_ms:.0f}ms  "
+            f"cost=${card.avg_cost_usd:.6f}  "
+            f"tool_success={card.tool_success_rate:.0%}"
+        )
+
+    if len(scorecards) == 2:
+        a, b = scorecards
+        winner = a.model_code if a.avg_overall >= b.avg_overall else b.model_code
+        print(f"\n  Winner: {winner} (by overall score)")