from __future__ import annotations

from pydantic import BaseModel


class ScoreDetail(BaseModel):
    """Result of checking a single named criterion."""

    criterion: str
    passed: bool
    note: str = ""


class ScenarioScore(BaseModel):
    """Raw measurements for one scenario run plus a derived overall score."""

    scenario_id: str
    # NOTE(review): some pydantic v2 releases warn about field names that
    # start with "model_" (protected namespace) -- confirm against the
    # installed pydantic version.
    model_code: str
    latency_ms: int
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    tool_called: bool
    tool_succeeded: bool
    answer_quality: float
    details: list[ScoreDetail]
    raw_answer: str = ""
    run_finished: bool = True

    @property
    def overall_score(self) -> float:
        """Return the weighted blend of tool, answer-quality and latency scores.

        The tool component is 1.0 when the tool succeeded, 0.5 when it was
        only called, and 0.0 otherwise. ``answer_quality`` is used as given
        (it is not clamped here).
        """
        weights = {
            "tool_correctness": 0.3,
            "answer_quality": 0.5,
            "latency": 0.2,
        }

        if self.tool_succeeded:
            tool_component = 1.0
        elif self.tool_called:
            tool_component = 0.5
        else:
            tool_component = 0.0
        latency_component = self._latency_score()

        tool_part = weights["tool_correctness"] * tool_component
        quality_part = weights["answer_quality"] * self.answer_quality
        latency_part = weights["latency"] * latency_component
        # Added left to right, in this order, to keep float rounding stable.
        return tool_part + quality_part + latency_part

    def _latency_score(self) -> float:
        """Map ``latency_ms`` onto a tiered score between 0.1 and 1.0."""
        # (inclusive upper bound in ms, score), checked from fastest tier up.
        tiers = (
            (5000, 1.0),
            (15000, 0.7),
            (30000, 0.4),
        )
        for ceiling_ms, tier_score in tiers:
            if self.latency_ms <= ceiling_ms:
                return tier_score
        return 0.1


class ModelScorecard(BaseModel):
    """Per-model collection of scenario scores with aggregate views."""

    model_code: str
    scenario_scores: list[ScenarioScore]

    @property
    def avg_overall(self) -> float:
        """Mean ``overall_score`` across scenarios; 0.0 when there are none."""
        runs = self.scenario_scores
        return sum(run.overall_score for run in runs) / len(runs) if runs else 0.0

    @property
    def avg_latency_ms(self) -> float:
        """Mean ``latency_ms`` across scenarios; 0.0 when there are none."""
        runs = self.scenario_scores
        return sum(run.latency_ms for run in runs) / len(runs) if runs else 0.0

    @property
    def avg_cost_usd(self) -> float:
        """Mean ``cost_usd`` across scenarios; 0.0 when there are none."""
        runs = self.scenario_scores
        return sum(run.cost_usd for run in runs) / len(runs) if runs else 0.0

    @property
    def tool_success_rate(self) -> float:
        """Fraction of scenarios whose tool succeeded; 0.0 when there are none."""
        runs = self.scenario_scores
        if not runs:
            return 0.0
        succeeded = [run for run in runs if run.tool_succeeded]
        return len(succeeded) / len(runs)

    def summary_table(self) -> str:
        """Render the scorecard as a multi-line text report.

        The report has a header with the aggregate figures, one row per
        scenario, and a closing rule. A row's PASS/FAIL tag reflects
        ``tool_succeeded`` only.
        """
        heavy_rule = "=" * 60
        light_rule = "-" * 60

        header = [
            f"\n{heavy_rule}",
            f"Model Scorecard: {self.model_code}",
            f"{heavy_rule}",
            f" Avg Overall Score : {self.avg_overall:.2f}",
            f" Avg Latency : {self.avg_latency_ms:.0f}ms",
            f" Avg Cost : ${self.avg_cost_usd:.6f}",
            f" Tool Success Rate : {self.tool_success_rate:.0%}",
            f"{light_rule}",
        ]

        rows = []
        for s in self.scenario_scores:
            if s.tool_succeeded:
                status = "PASS"
            else:
                status = "FAIL"
            rows.append(
                f" [{status}] {s.scenario_id:<25} "
                f"score={s.overall_score:.2f} "
                f"lat={s.latency_ms}ms "
                f"cost=${s.cost_usd:.6f}"
            )

        return "\n".join([*header, *rows, f"{heavy_rule}"])