refactor: 移除 LiteLLM proxy 架构，后端直连 Provider API

- 移除 backend/scripts/build_litellm_proxy_config.py
- 简化 LiteLLMService，移除 run_completion_with_cost 方法
- AgentScopeRunner 改为从 LlmFactory 获取 api_base 和 api_key
- 部署配置移除 litellm/litellm-config-job 服务
- Flutter 新增 AuthBootScreen 引导页
- Android 添加通知权限 (POST_NOTIFICATIONS, RECEIVE_BOOT_COMPLETED, SCHEDULE_EXACT_ALARM)
- 优化 LocalNotificationService 调度失败 fallback
- 更新 manifest.json (version 3)
2026-03-17 18:05:49 +08:00
parent cf56b358ad
commit 19981964fb
26 changed files with 417 additions and 1018 deletions
@@ -1,9 +1,5 @@
 from __future__ import annotations

-from services.litellm.service import (
-    LiteLLMResponseWithCost,
-    LiteLLMService,
-    LiteLLMUsage,
-)
+from services.litellm.service import LiteLLMService

-__all__ = ["LiteLLMService", "LiteLLMUsage", "LiteLLMResponseWithCost"]
+__all__ = ["LiteLLMService"]
@@ -1,11 +1,8 @@
 from __future__ import annotations

 from dataclasses import dataclass
-from typing import Any, Callable
+from typing import Any

-from litellm import completion
-
-from core.config.settings import config
 from core.config.initial.init_data import load_llm_catalog


@@ -17,34 +14,10 @@ class PricingTier:
    cache_hit_cost_per_token: float


-@dataclass(frozen=True)
-class LiteLLMUsage:
-    prompt_tokens: int
-    completion_tokens: int
-    total_tokens: int
-    cached_prompt_tokens: int
-    cost: float
-
-
-@dataclass(frozen=True)
-class LiteLLMResponseWithCost:
-    response: dict[str, Any]
-    usage: LiteLLMUsage
-
-
 class LiteLLMService:
-    proxy_base_url: str
-    proxy_api_key: str
    _pricing_by_model: dict[str, tuple[PricingTier, ...]]

-    def __init__(
-        self,
-        *,
-        proxy_base_url: str | None = None,
-        proxy_api_key: str | None = None,
-    ) -> None:
-        self.proxy_base_url = proxy_base_url or config.litellm.base_url
-        self.proxy_api_key = proxy_api_key or config.litellm.api_key
+    def __init__(self) -> None:
        self._pricing_by_model = self._build_pricing_map()

    @staticmethod
@@ -142,78 +115,3 @@ class LiteLLMService:
            "cost": cost,
            "latencyMs": latency_ms,
        }
-
-    def run_completion_with_cost(
-        self,
-        *,
-        model: str,
-        messages: list[dict[str, Any]],
-        temperature: float | None = None,
-        max_tokens: int | None = None,
-        timeout: float | None = None,
-        response_format: dict[str, Any] | None = None,
-        completion_fn: Callable[..., dict[str, Any]] | None = None,
-    ) -> LiteLLMResponseWithCost:
-        caller = completion_fn or completion
-        request_model = model if model.startswith("openai/") else f"openai/{model}"
-
-        request_kwargs: dict[str, Any] = {
-            "model": request_model,
-            "api_key": self.proxy_api_key,
-            "api_base": self.proxy_base_url,
-            "messages": messages,
-            "temperature": temperature,
-            "max_tokens": max_tokens,
-            "timeout": timeout,
-            "stream": False,
-        }
-        if response_format is not None:
-            request_kwargs["response_format"] = response_format
-
-        response_any = caller(**request_kwargs)
-        response = self._normalize_response(response_any)
-
-        usage_raw = response.get("usage")
-        if not isinstance(usage_raw, dict):
-            raise ValueError("missing usage in response")
-
-        prompt_tokens = int(usage_raw.get("prompt_tokens", 0) or 0)
-        completion_tokens = int(usage_raw.get("completion_tokens", 0) or 0)
-        total_tokens = int(
-            usage_raw.get("total_tokens", prompt_tokens + completion_tokens) or 0
-        )
-        cached_prompt_tokens = 0
-        prompt_tokens_details = usage_raw.get("prompt_tokens_details")
-        if isinstance(prompt_tokens_details, dict):
-            cached_prompt_tokens = int(
-                prompt_tokens_details.get("cached_tokens", 0) or 0
-            )
-
-        resolved_model = str(response.get("model", model)).strip()
-        cost = self.calculate_cost(
-            model=resolved_model,
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            cached_prompt_tokens=cached_prompt_tokens,
-        )
-        return LiteLLMResponseWithCost(
-            response=response,
-            usage=LiteLLMUsage(
-                prompt_tokens=prompt_tokens,
-                completion_tokens=completion_tokens,
-                total_tokens=total_tokens,
-                cached_prompt_tokens=cached_prompt_tokens,
-                cost=cost,
-            ),
-        )
-
-    @staticmethod
-    def _normalize_response(response_any: Any) -> dict[str, Any]:
-        if isinstance(response_any, dict):
-            return response_any
-        model_dump = getattr(response_any, "model_dump", None)
-        if callable(model_dump):
-            dumped = model_dump()
-            if isinstance(dumped, dict):
-                return dumped
-        raise ValueError("litellm response is not serializable")