feat(agent): redesign project_cli with module/method/input protocol
- Replace command/subcommand/args with module/method/input envelope
- Calendar handler uses discriminated union (mode) for read operations
- Strict Pydantic models with extra='forbid' for all calendar methods
- Worker max_iters=7, router prompt simplified (removed project_cli_defaults)
- Skill index cards + per-action files for progressive disclosure
- Frontend/AG-UI aligned to module/method dispatch
- Protocol docs updated to module/method/input contract

WIP: action cards need envelope fix, 2 tests need update, memory handler needs Pydantic models.
This commit is contained in:
@@ -0,0 +1,99 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class ScoreDetail(BaseModel):
    """One criterion-level entry attached to a scenario score.

    Carries the criterion text, a pass/fail flag and an optional free-form
    note. No validation beyond the declared field types is defined here.
    """

    # Identifier or description of the criterion being reported on.
    criterion: str
    # Whether the criterion was met.
    passed: bool
    # Optional explanatory text; defaults to the empty string.
    note: str = ""
|
||||
|
||||
|
||||
class ScenarioScore(BaseModel):
    """Raw measurements for one scenario run plus a derived weighted score.

    The stored fields are plain data; ``overall_score`` combines three of
    them (tool outcome, ``answer_quality`` and latency) into a single float.
    """

    scenario_id: str
    model_code: str
    latency_ms: int
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    tool_called: bool
    tool_succeeded: bool
    # NOTE(review): presumably expected to lie in [0, 1]; nothing in this
    # class enforces a range, so confirm against the producer.
    answer_quality: float
    details: list[ScoreDetail]
    raw_answer: str = ""
    run_finished: bool = True

    @property
    def overall_score(self) -> float:
        """Return the weighted sum of tool, answer-quality and latency scores.

        Weights are 0.3 for tool correctness, 0.5 for answer quality and 0.2
        for latency. The tool component is 1.0 when the tool succeeded, 0.5
        when it was called but did not succeed, and 0.0 otherwise.
        """
        if self.tool_succeeded:
            tool_component = 1.0
        elif self.tool_called:
            tool_component = 0.5
        else:
            tool_component = 0.0

        latency_component = self._latency_score()

        # Terms are kept in the order tool -> quality -> latency so the
        # floating-point result matches the original expression exactly.
        return (
            0.3 * tool_component
            + 0.5 * self.answer_quality
            + 0.2 * latency_component
        )

    def _latency_score(self) -> float:
        """Map ``latency_ms`` onto a stepped score.

        Up to 5000 ms scores 1.0, up to 15000 ms scores 0.7, up to 30000 ms
        scores 0.4, and anything slower scores 0.1. Bounds are inclusive.
        """
        tiers = ((5000, 1.0), (15000, 0.7), (30000, 0.4))
        for ceiling_ms, tier_score in tiers:
            if self.latency_ms <= ceiling_ms:
                return tier_score
        return 0.1
|
||||
|
||||
|
||||
class ModelScorecard(BaseModel):
    """Collection of scenario scores for one model, with aggregate views.

    Every aggregate property returns ``0.0`` when ``scenario_scores`` is
    empty, which avoids dividing by zero.
    """

    model_code: str
    scenario_scores: list[ScenarioScore]

    @property
    def avg_overall(self) -> float:
        """Mean of ``overall_score`` across all scenarios (0.0 if none)."""
        scores = self.scenario_scores
        return sum(s.overall_score for s in scores) / len(scores) if scores else 0.0

    @property
    def avg_latency_ms(self) -> float:
        """Mean of ``latency_ms`` across all scenarios (0.0 if none)."""
        scores = self.scenario_scores
        return sum(s.latency_ms for s in scores) / len(scores) if scores else 0.0

    @property
    def avg_cost_usd(self) -> float:
        """Mean of ``cost_usd`` across all scenarios (0.0 if none)."""
        scores = self.scenario_scores
        return sum(s.cost_usd for s in scores) / len(scores) if scores else 0.0

    @property
    def tool_success_rate(self) -> float:
        """Fraction of scenarios whose ``tool_succeeded`` is true (0.0 if none)."""
        scores = self.scenario_scores
        if not scores:
            return 0.0
        succeeded = sum(1 for s in scores if s.tool_succeeded)
        return succeeded / len(scores)

    def summary_table(self) -> str:
        """Render the scorecard as a multi-line plain-text report.

        The report starts with a newline, then a header with the four
        aggregate figures, one row per scenario (PASS/FAIL is taken from
        ``tool_succeeded``), and a closing rule. Lines are joined with
        ``"\\n"`` and there is no trailing newline.
        """
        heavy_rule = "=" * 60
        light_rule = "-" * 60

        header = [
            f"\n{heavy_rule}",
            f"Model Scorecard: {self.model_code}",
            heavy_rule,
            f" Avg Overall Score : {self.avg_overall:.2f}",
            f" Avg Latency : {self.avg_latency_ms:.0f}ms",
            f" Avg Cost : ${self.avg_cost_usd:.6f}",
            f" Tool Success Rate : {self.tool_success_rate:.0%}",
            light_rule,
        ]

        rows = [
            f" [{'PASS' if s.tool_succeeded else 'FAIL'}] {s.scenario_id:<25} "
            f"score={s.overall_score:.2f} "
            f"lat={s.latency_ms}ms "
            f"cost=${s.cost_usd:.6f}"
            for s in self.scenario_scores
        ]

        return "\n".join([*header, *rows, heavy_rule])
|
||||
Reference in New Issue
Block a user