refactor: 移除 LiteLLM proxy 架构,后端直连 Provider API
- 移除 backend/scripts/build_litellm_proxy_config.py
- 简化 LiteLLMService,移除 run_completion_with_cost 方法
- AgentScopeRunner 改为从 LlmFactory 获取 api_base 和 api_key
- 部署配置移除 litellm/litellm-config-job 服务
- Flutter 新增 AuthBootScreen 引导页
- Android 添加通知权限 (POST_NOTIFICATIONS, RECEIVE_BOOT_COMPLETED, SCHEDULE_EXACT_ALARM)
- 优化 LocalNotificationService 调度失败 fallback
- 更新 manifest.json (version 3)
This commit is contained in:
@@ -1,9 +1,5 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from services.litellm.service import (
|
||||
LiteLLMResponseWithCost,
|
||||
LiteLLMService,
|
||||
LiteLLMUsage,
|
||||
)
|
||||
from services.litellm.service import LiteLLMService
|
||||
|
||||
__all__ = ["LiteLLMService", "LiteLLMUsage", "LiteLLMResponseWithCost"]
|
||||
__all__ = ["LiteLLMService"]
|
||||
|
||||
@@ -1,11 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Callable
|
||||
from typing import Any
|
||||
|
||||
from litellm import completion
|
||||
|
||||
from core.config.settings import config
|
||||
from core.config.initial.init_data import load_llm_catalog
|
||||
|
||||
|
||||
@@ -17,34 +14,10 @@ class PricingTier:
|
||||
cache_hit_cost_per_token: float
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class LiteLLMUsage:
    """Immutable token-usage and cost summary for one completion call."""

    # Token count reported under the response usage key "prompt_tokens".
    prompt_tokens: int
    # Token count reported under the response usage key "completion_tokens".
    completion_tokens: int
    # Reported total; the producer falls back to prompt + completion.
    total_tokens: int
    # Prompt tokens served from cache ("prompt_tokens_details.cached_tokens").
    cached_prompt_tokens: int
    # Cost computed by the service for this call.
    # NOTE(review): currency/unit is not visible here - confirm against
    # the pricing catalog.
    cost: float
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class LiteLLMResponseWithCost:
    """Immutable pair of a completion response and its usage/cost summary."""

    # The completion response as a plain dictionary.
    response: dict[str, Any]
    # Token counts and computed cost extracted from that response.
    usage: LiteLLMUsage
|
||||
|
||||
|
||||
class LiteLLMService:
|
||||
proxy_base_url: str
|
||||
proxy_api_key: str
|
||||
_pricing_by_model: dict[str, tuple[PricingTier, ...]]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
proxy_base_url: str | None = None,
|
||||
proxy_api_key: str | None = None,
|
||||
) -> None:
|
||||
self.proxy_base_url = proxy_base_url or config.litellm.base_url
|
||||
self.proxy_api_key = proxy_api_key or config.litellm.api_key
|
||||
def __init__(self) -> None:
|
||||
self._pricing_by_model = self._build_pricing_map()
|
||||
|
||||
@staticmethod
|
||||
@@ -142,78 +115,3 @@ class LiteLLMService:
|
||||
"cost": cost,
|
||||
"latencyMs": latency_ms,
|
||||
}
|
||||
|
||||
def run_completion_with_cost(
    self,
    *,
    model: str,
    messages: list[dict[str, Any]],
    temperature: float | None = None,
    max_tokens: int | None = None,
    timeout: float | None = None,
    response_format: dict[str, Any] | None = None,
    completion_fn: Callable[..., dict[str, Any]] | None = None,
) -> LiteLLMResponseWithCost:
    """Run a non-streaming completion and attach token usage and cost.

    Args:
        model: Model identifier; ``openai/`` is prepended when missing.
        messages: Chat messages forwarded unchanged to the completion call.
        temperature: Forwarded as-is (``None`` is passed through).
        max_tokens: Forwarded as-is (``None`` is passed through).
        timeout: Forwarded as-is (``None`` is passed through).
        response_format: Only included in the request when not ``None``.
        completion_fn: Callable used instead of ``litellm.completion``
            when provided.

    Returns:
        The normalized response dict together with a ``LiteLLMUsage``.

    Raises:
        ValueError: If the response cannot be normalized to a dict, or its
            ``usage`` entry is missing or not a dict.
    """
    caller = completion_fn or completion
    # NOTE(review): the prefix presumably makes LiteLLM treat the configured
    # base URL as an OpenAI-compatible endpoint - confirm.
    request_model = model if model.startswith("openai/") else f"openai/{model}"

    request_kwargs: dict[str, Any] = {
        "model": request_model,
        "api_key": self.proxy_api_key,
        "api_base": self.proxy_base_url,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "timeout": timeout,
        "stream": False,
    }
    if response_format is not None:
        request_kwargs["response_format"] = response_format

    response = self._normalize_response(caller(**request_kwargs))

    usage_raw = response.get("usage")
    if not isinstance(usage_raw, dict):
        raise ValueError("missing usage in response")

    prompt_tokens = int(usage_raw.get("prompt_tokens", 0) or 0)
    completion_tokens = int(usage_raw.get("completion_tokens", 0) or 0)
    # Fall back to the sum when the total is absent, None or zero. The
    # previous `... or 0` reported 0 whenever the key was present but null.
    total_tokens = int(
        usage_raw.get("total_tokens") or (prompt_tokens + completion_tokens)
    )

    cached_prompt_tokens = 0
    prompt_tokens_details = usage_raw.get("prompt_tokens_details")
    if isinstance(prompt_tokens_details, dict):
        cached_prompt_tokens = int(
            prompt_tokens_details.get("cached_tokens", 0) or 0
        )

    # Use the requested model when the response omits the model or reports
    # it as null; previously a null became the literal string "None".
    resolved_model = str(response.get("model") or model).strip()
    cost = self.calculate_cost(
        model=resolved_model,
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        cached_prompt_tokens=cached_prompt_tokens,
    )
    return LiteLLMResponseWithCost(
        response=response,
        usage=LiteLLMUsage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            cached_prompt_tokens=cached_prompt_tokens,
            cost=cost,
        ),
    )
|
||||
|
||||
@staticmethod
def _normalize_response(response_any: Any) -> dict[str, Any]:
    """Coerce a completion result into a plain ``dict``.

    A ``dict`` is returned as-is (same object). Otherwise the object's
    ``model_dump()``, ``dict()`` and ``to_dict()`` methods are tried in
    that order, and the first one that returns a ``dict`` wins.

    Args:
        response_any: The raw value returned by the completion callable.

    Returns:
        The response as a dictionary.

    Raises:
        ValueError: If no supported conversion yields a ``dict``.
    """
    if isinstance(response_any, dict):
        return response_any

    # `model_dump` stays first so objects that already worked behave the
    # same; the other names cover objects exposing an older or alternative
    # serializer method.
    for method_name in ("model_dump", "dict", "to_dict"):
        serializer = getattr(response_any, method_name, None)
        if not callable(serializer):
            continue
        dumped = serializer()
        if isinstance(dumped, dict):
            return dumped

    raise ValueError("litellm response is not serializable")
|
||||
|
||||
Reference in New Issue
Block a user