Files
qzl d060962a5f feat(agent): redesign project_cli with module/method/input protocol
- Replace command/subcommand/args with module/method/input envelope
- Calendar handler uses discriminated union (mode) for read operations
- Strict Pydantic models with extra='forbid' for all calendar methods
- Worker max_iters=7, router prompt simplified (removed project_cli_defaults)
- Skill index cards + per-action files for progressive disclosure
- Frontend/AG-UI aligned to module/method dispatch
- Protocol docs updated to module/method/input contract

WIP: action cards still need the envelope fix, two tests need updating, and the
memory handler needs Pydantic models.
2026-04-24 13:24:13 +08:00

100 lines
2.9 KiB
Python

from __future__ import annotations
from pydantic import BaseModel
class ScoreDetail(BaseModel):
    """One criterion entry: its name, a pass/fail flag and an optional note."""

    criterion: str
    passed: bool
    # Free-text remark; defaults to the empty string when not supplied.
    note: str = ""
class ScenarioScore(BaseModel):
    """Per-scenario result record with a derived overall score.

    Holds the scenario and model identifiers, latency, token and cost
    figures, tool-call flags, an answer-quality value and the per-criterion
    details. ``overall_score`` blends tool correctness (weight 0.3), answer
    quality (weight 0.5) and latency (weight 0.2).
    """

    scenario_id: str
    model_code: str
    latency_ms: int
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    tool_called: bool
    tool_succeeded: bool
    answer_quality: float
    details: list[ScoreDetail]
    raw_answer: str = ""
    run_finished: bool = True

    @property
    def overall_score(self) -> float:
        """Return the weighted sum of the tool, quality and latency parts.

        The tool part is 1.0 when the tool call succeeded, 0.5 when a tool
        was called without succeeding, and 0.0 otherwise. ``answer_quality``
        is used exactly as stored; no clamping is applied here.
        """
        if self.tool_succeeded:
            tool_component = 1.0
        elif self.tool_called:
            tool_component = 0.5
        else:
            tool_component = 0.0

        latency_component = self._latency_score()

        # Terms are added in tool -> quality -> latency order.
        weighted_tool = 0.3 * tool_component
        weighted_quality = 0.5 * self.answer_quality
        weighted_latency = 0.2 * latency_component
        return weighted_tool + weighted_quality + weighted_latency

    def _latency_score(self) -> float:
        """Map ``latency_ms`` onto a stepped score from 1.0 down to 0.1."""
        # (inclusive upper bound in ms, score) pairs, checked in ascending order.
        tiers = ((5000, 1.0), (15000, 0.7), (30000, 0.4))
        for upper_bound_ms, tier_score in tiers:
            if self.latency_ms <= upper_bound_ms:
                return tier_score
        # Anything slower than the last tier gets the floor score.
        return 0.1
class ModelScorecard(BaseModel):
    """Scenario scores collected for one model, plus aggregate views.

    Every aggregate property returns 0.0 when ``scenario_scores`` is empty.
    """

    model_code: str
    scenario_scores: list[ScenarioScore]

    @property
    def avg_overall(self) -> float:
        """Mean of ``overall_score`` across the scenario scores."""
        entries = self.scenario_scores
        if entries:
            return sum(entry.overall_score for entry in entries) / len(entries)
        return 0.0

    @property
    def avg_latency_ms(self) -> float:
        """Mean of ``latency_ms`` across the scenario scores."""
        entries = self.scenario_scores
        if entries:
            return sum(entry.latency_ms for entry in entries) / len(entries)
        return 0.0

    @property
    def avg_cost_usd(self) -> float:
        """Mean of ``cost_usd`` across the scenario scores."""
        entries = self.scenario_scores
        if entries:
            return sum(entry.cost_usd for entry in entries) / len(entries)
        return 0.0

    @property
    def tool_success_rate(self) -> float:
        """Fraction of scenario scores whose ``tool_succeeded`` flag is set."""
        entries = self.scenario_scores
        if entries:
            successes = sum(1 for entry in entries if entry.tool_succeeded)
            return successes / len(entries)
        return 0.0

    def summary_table(self) -> str:
        """Render the scorecard as a multi-line plain-text report.

        The report starts with a header holding the aggregate metrics,
        followed by one row per scenario score. A row is tagged ``PASS``
        when its ``tool_succeeded`` flag is set and ``FAIL`` otherwise.

        Returns:
            The report lines joined with newline characters.
        """
        heavy_rule = "=" * 60
        light_rule = "-" * 60

        header = [
            f"\n{heavy_rule}",
            f"Model Scorecard: {self.model_code}",
            heavy_rule,
            f" Avg Overall Score : {self.avg_overall:.2f}",
            f" Avg Latency : {self.avg_latency_ms:.0f}ms",
            f" Avg Cost : ${self.avg_cost_usd:.6f}",
            f" Tool Success Rate : {self.tool_success_rate:.0%}",
            light_rule,
        ]

        rows = []
        for entry in self.scenario_scores:
            if entry.tool_succeeded:
                tag = "PASS"
            else:
                tag = "FAIL"
            rows.append(
                f" [{tag}] {entry.scenario_id:<25} "
                f"score={entry.overall_score:.2f} "
                f"lat={entry.latency_ms}ms "
                f"cost=${entry.cost_usd:.6f}"
            )

        return "\n".join([*header, *rows, heavy_rule])