83 lines
2.4 KiB
Python
83 lines
2.4 KiB
Python
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
from pydantic import BaseModel
|
||
|
|
|
||
|
|
|
||
|
|
class EvalScenario(BaseModel):
    """A single evaluation case: a prompt plus the expectations attached to it.

    All six fields are required; none declares a default value.
    """

    # Identifier string; the scenarios in this module use slugs such as
    # "calendar-read-today".
    id: str

    # The prompt text for this scenario.
    prompt: str

    # Grouping label; this module uses "calendar" and "general".
    category: str

    # Presumably whether a tool call is expected for this prompt -- how the
    # flag is consumed is not visible in this module.
    expect_tool_use: bool

    # Presumably whether the tool call is expected to succeed; the no-tool
    # scenarios in this module set it to False.
    expect_tool_success: bool

    # Free-text criteria strings attached to the scenario.
    quality_criteria: list[str]
|
||
|
|
|
||
|
|
|
||
|
|
# Calendar scenarios: each one sets both tool expectations to True.
CALENDAR_SCENARIOS: list[EvalScenario] = [
    # Prompt asks for today's schedule.
    EvalScenario(
        id="calendar-read-today",
        category="calendar",
        prompt="请查询我今天的日程安排",
        expect_tool_use=True,
        expect_tool_success=True,
        quality_criteria=[
            "应调用 project_cli 的 calendar.read 方法",
            "input 应包含 mode=day 和具体日期",
            "回答应基于工具返回的实际数据",
            "如果无日程,应明确告知无日程",
        ],
    ),
    # Prompt asks to create a two-hour meeting tomorrow at 3 pm.
    EvalScenario(
        id="calendar-create-event",
        category="calendar",
        prompt="帮我创建一个明天下午3点两小时的会议,标题是项目周会",
        expect_tool_use=True,
        expect_tool_success=True,
        quality_criteria=[
            "应调用 project_cli 的 calendar.create 方法",
            "input 应包含 title、start_at、timezone",
            "start_at 应为具体的时间戳而非自然语言",
            "应返回创建结果(包含 event_id)",
        ],
    ),
    # Prompt asks for the schedule from Monday to Friday of this week.
    EvalScenario(
        id="calendar-read-range",
        category="calendar",
        prompt="这周一到周五我有哪些日程?",
        expect_tool_use=True,
        expect_tool_success=True,
        quality_criteria=[
            "应调用 project_cli 的 calendar.read 方法",
            "input 应使用 mode=range 或多次 mode=day",
            "应提供完整时间范围",
        ],
    ),
]
|
||
|
|
|
||
|
|
# General conversation scenarios: each one sets both tool expectations to
# False.
GENERAL_SCENARIOS: list[EvalScenario] = [
    # Greeting: the prompt asks who the assistant is.
    EvalScenario(
        id="general-greeting",
        category="general",
        prompt="你好,你是谁?",
        expect_tool_use=False,
        expect_tool_success=False,
        quality_criteria=[
            "应简短自我介绍",
            "不应调用任何工具",
            "回答简洁不啰嗦",
        ],
    ),
    # Farewell: the prompt thanks the assistant and says goodbye.
    EvalScenario(
        id="general-farewell",
        category="general",
        prompt="好的谢谢,再见",
        expect_tool_use=False,
        expect_tool_success=False,
        quality_criteria=[
            "应礼貌告别",
            "不应调用任何工具",
        ],
    ),
]
|
||
|
|
|
||
|
|
# Every scenario in one new list: calendar entries first, then general ones.
ALL_SCENARIOS = [*CALENDAR_SCENARIOS, *GENERAL_SCENARIOS]
|