d060962a5f
- Replace command/subcommand/args with module/method/input envelope - Calendar handler uses discriminated union (mode) for read operations - Strict Pydantic models with extra='forbid' for all calendar methods - Worker max_iters=7, router prompt simplified (removed project_cli_defaults) - Skill index cards + per-action files for progressive disclosure - Frontend/AG-UI aligned to module/method dispatch - Protocol docs updated to module/method/input contract WIP: action cards need envelope fix, 2 tests need update, memory handler needs Pydantic models.
100 lines
2.9 KiB
Python
100 lines
2.9 KiB
Python
from __future__ import annotations
|
|
|
|
from pydantic import BaseModel
|
|
|
|
|
|
class ScoreDetail(BaseModel):
    """Pass/fail outcome recorded for a single scoring criterion."""

    # The criterion this entry reports on.
    criterion: str
    # Whether that criterion was met.
    passed: bool
    # Optional free-form remark; defaults to an empty string.
    note: str = ""
|
|
|
|
|
|
class ScenarioScore(BaseModel):
    """Measurements for one scenario run by one model, plus a weighted score.

    ``overall_score`` blends the tool outcome, ``answer_quality`` and a
    latency tier.  The token counts, ``cost_usd``, ``details``,
    ``raw_answer`` and ``run_finished`` are stored on the record but do not
    enter that score.
    """

    scenario_id: str
    model_code: str
    latency_ms: int
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    tool_called: bool
    tool_succeeded: bool
    # Used as-is (no clamping) by overall_score; presumably expected to lie
    # in [0, 1] -- confirm with whatever produces this value.
    answer_quality: float
    details: list[ScoreDetail]
    raw_answer: str = ""
    run_finished: bool = True

    @property
    def overall_score(self) -> float:
        """Return the weighted blend of tool, answer-quality and latency scores.

        Weights are 0.3 for tool correctness, 0.5 for answer quality and 0.2
        for latency.  The tool component is 1.0 when the tool succeeded, 0.5
        when it was called without succeeding, and 0.0 when it was not called.
        """
        tool_weight, quality_weight, latency_weight = 0.3, 0.5, 0.2

        if self.tool_succeeded:
            tool_component = 1.0
        elif self.tool_called:
            # Partial credit: the tool was invoked but did not succeed.
            tool_component = 0.5
        else:
            tool_component = 0.0

        return (
            tool_weight * tool_component
            + quality_weight * self.answer_quality
            + latency_weight * self._latency_score()
        )

    def _latency_score(self) -> float:
        """Map ``latency_ms`` onto a stepped score; lower latency scores higher.

        Returns 1.0 up to 5000 ms, 0.7 up to 15000 ms, 0.4 up to 30000 ms
        (all bounds inclusive) and 0.1 beyond that.
        """
        tiers = ((5000, 1.0), (15000, 0.7), (30000, 0.4))
        for ceiling_ms, tier_score in tiers:
            if self.latency_ms <= ceiling_ms:
                return tier_score
        return 0.1
|
|
|
|
|
|
class ModelScorecard(BaseModel):
    """All scenario scores collected for one model, with aggregate views."""

    model_code: str
    scenario_scores: list[ScenarioScore]

    def _mean(self, values: list) -> float:
        """Return the arithmetic mean of ``values``, or 0.0 when it is empty."""
        if not values:
            return 0.0
        return sum(values) / len(values)

    @property
    def avg_overall(self) -> float:
        """Mean ``overall_score`` across scenarios; 0.0 when there are none."""
        return self._mean([entry.overall_score for entry in self.scenario_scores])

    @property
    def avg_latency_ms(self) -> float:
        """Mean ``latency_ms`` across scenarios; 0.0 when there are none."""
        return self._mean([entry.latency_ms for entry in self.scenario_scores])

    @property
    def avg_cost_usd(self) -> float:
        """Mean ``cost_usd`` across scenarios; 0.0 when there are none."""
        return self._mean([entry.cost_usd for entry in self.scenario_scores])

    @property
    def tool_success_rate(self) -> float:
        """Fraction of scenarios whose tool succeeded; 0.0 when there are none."""
        return self._mean(
            [1 if entry.tool_succeeded else 0 for entry in self.scenario_scores]
        )

    def summary_table(self) -> str:
        """Render a plain-text report for this model.

        The report starts with a newline and a 60-character rule, lists the
        four aggregate figures, then prints one line per scenario.  The
        PASS/FAIL tag on each scenario line reflects ``tool_succeeded`` only.
        """
        heavy_rule = "=" * 60
        light_rule = "-" * 60

        # NOTE(review): spacing inside the label strings is reproduced exactly
        # as seen during review -- confirm column alignment against the
        # repository copy.
        header = [
            f"\n{heavy_rule}",
            f"Model Scorecard: {self.model_code}",
            heavy_rule,
            f" Avg Overall Score : {self.avg_overall:.2f}",
            f" Avg Latency : {self.avg_latency_ms:.0f}ms",
            f" Avg Cost : ${self.avg_cost_usd:.6f}",
            f" Tool Success Rate : {self.tool_success_rate:.0%}",
            light_rule,
        ]
        rows = [
            f" [{'PASS' if entry.tool_succeeded else 'FAIL'}] {entry.scenario_id:<25} "
            f"score={entry.overall_score:.2f} "
            f"lat={entry.latency_ms}ms "
            f"cost=${entry.cost_usd:.6f}"
            for entry in self.scenario_scores
        ]
        return "\n".join([*header, *rows, heavy_rule])
|