feat(agent): redesign project_cli with module/method/input protocol

- Replace command/subcommand/args with module/method/input envelope
- Calendar handler uses discriminated union (mode) for read operations
- Strict Pydantic models with extra='forbid' for all calendar methods
- Worker max_iters=7, router prompt simplified (removed project_cli_defaults)
- Skill index cards + per-action files for progressive disclosure
- Frontend/AG-UI aligned to module/method dispatch
- Protocol docs updated to module/method/input contract

WIP: action cards need envelope fix, 2 tests need update, memory handler needs Pydantic models.
2026-04-24 13:24:13 +08:00
parent ab526af2c4
commit d060962a5f
62 changed files with 4802 additions and 805 deletions
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+from pydantic import BaseModel
+
+
+class ScoreDetail(BaseModel):
+    """Result of checking one criterion: its name, a pass flag, and a note."""
+
+    # Name or description of the criterion that was checked.
+    criterion: str
+    # Whether the criterion was satisfied.
+    passed: bool
+    # Optional free-form explanation; defaults to an empty string.
+    note: str = ""
+
+
+class ScenarioScore(BaseModel):
+    scenario_id: str
+    model_code: str
+    latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float = 0.0
+    tool_called: bool
+    tool_succeeded: bool
+    answer_quality: float
+    details: list[ScoreDetail]
+    raw_answer: str = ""
+    run_finished: bool = True
+
+    @property
+    def overall_score(self) -> float:
+        weights = {
+            "tool_correctness": 0.3,
+            "answer_quality": 0.5,
+            "latency": 0.2,
+        }
+        tool_score = 1.0 if self.tool_succeeded else (0.5 if self.tool_called else 0.0)
+        latency_score = self._latency_score()
+        return (
+            weights["tool_correctness"] * tool_score
+            + weights["answer_quality"] * self.answer_quality
+            + weights["latency"] * latency_score
+        )
+
+    def _latency_score(self) -> float:
+        if self.latency_ms <= 5000:
+            return 1.0
+        if self.latency_ms <= 15000:
+            return 0.7
+        if self.latency_ms <= 30000:
+            return 0.4
+        return 0.1
+
+
+class ModelScorecard(BaseModel):
+    model_code: str
+    scenario_scores: list[ScenarioScore]
+
+    @property
+    def avg_overall(self) -> float:
+        if not self.scenario_scores:
+            return 0.0
+        return sum(s.overall_score for s in self.scenario_scores) / len(self.scenario_scores)
+
+    @property
+    def avg_latency_ms(self) -> float:
+        if not self.scenario_scores:
+            return 0.0
+        return sum(s.latency_ms for s in self.scenario_scores) / len(self.scenario_scores)
+
+    @property
+    def avg_cost_usd(self) -> float:
+        if not self.scenario_scores:
+            return 0.0
+        return sum(s.cost_usd for s in self.scenario_scores) / len(self.scenario_scores)
+
+    @property
+    def tool_success_rate(self) -> float:
+        if not self.scenario_scores:
+            return 0.0
+        return sum(1 for s in self.scenario_scores if s.tool_succeeded) / len(self.scenario_scores)
+
+    def summary_table(self) -> str:
+        lines = [
+            f"\n{'='*60}",
+            f"Model Scorecard: {self.model_code}",
+            f"{'='*60}",
+            f"  Avg Overall Score : {self.avg_overall:.2f}",
+            f"  Avg Latency       : {self.avg_latency_ms:.0f}ms",
+            f"  Avg Cost          : ${self.avg_cost_usd:.6f}",
+            f"  Tool Success Rate : {self.tool_success_rate:.0%}",
+            f"{'-'*60}",
+        ]
+        for s in self.scenario_scores:
+            status = "PASS" if s.tool_succeeded else "FAIL"
+            lines.append(
+                f"  [{status}] {s.scenario_id:<25} "
+                f"score={s.overall_score:.2f} "
+                f"lat={s.latency_ms}ms "
+                f"cost=${s.cost_usd:.6f}"
+            )
+        lines.append(f"{'='*60}")
+        return "\n".join(lines)