feat(agent): session deletion anonymization for iOS compliance
Replace soft-delete with anonymize + hard-delete to meet iOS App Store data retention requirements. Non-PII fields are preserved in anonymous_session_snapshots for analytics.
- Add the anonymous_session_snapshots table and ORM model.
- Implement an anonymizer that extracts non-PII fields before deletion.
- Remove the points_ledger.biz_id FK constraint (it becomes a snapshot-style reference).
- Preserve transaction history while allowing session deletion.
- Add 14 unit tests and 1 integration test.
This commit is contained in:
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
from .agent_chat_message import AgentChatMessage
|
||||
from .agent_chat_session import AgentChatSession
|
||||
from .anonymous_session_snapshot import AnonymousSessionSnapshot
|
||||
from .auth_user import AuthUser
|
||||
from .invite_code import InviteCode
|
||||
from .llm import Llm
|
||||
@@ -18,6 +19,7 @@ from .user_points import UserPoints
|
||||
__all__ = [
|
||||
"AgentChatMessage",
|
||||
"AgentChatSession",
|
||||
"AnonymousSessionSnapshot",
|
||||
"AuthUser",
|
||||
"InviteCode",
|
||||
"Llm",
|
||||
|
||||
@@ -0,0 +1,46 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import Boolean, DateTime, Integer, Numeric, String, Text
|
||||
from sqlalchemy.dialects.postgresql import ARRAY, UUID
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
|
||||
from core.db.base import Base
|
||||
|
||||
__all__ = ["AnonymousSessionSnapshot"]
|
||||
|
||||
|
||||
class AnonymousSessionSnapshot(Base):
    """ORM model for one row of the ``anonymous_session_snapshots`` table.

    A row holds session-level fields (type, status, counters, divination
    attributes, model usage) plus three timestamps. Only ``id``,
    ``anonymous_id``, ``session_type``, ``created_at`` and ``anonymized_at``
    are mandatory; every other column is nullable.
    """

    __tablename__: str = "anonymous_session_snapshots"

    # --- identity -------------------------------------------------------
    # Primary key; generated client-side with uuid4 when not supplied.
    id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    # Required opaque identifier for the snapshot (no default; the writer
    # must provide it).
    anonymous_id: Mapped[uuid.UUID] = mapped_column(
        UUID(as_uuid=True),
        nullable=False,
    )

    # --- session summary ------------------------------------------------
    session_type: Mapped[str] = mapped_column(String(20), nullable=False)
    message_count: Mapped[int | None] = mapped_column(Integer, nullable=True)
    status: Mapped[str | None] = mapped_column(String(20), nullable=True)
    question_type: Mapped[str | None] = mapped_column(String(50), nullable=True)
    tool_name: Mapped[str | None] = mapped_column(String(100), nullable=True)

    # --- divination attributes -----------------------------------------
    gua_name: Mapped[str | None] = mapped_column(String(50), nullable=True)
    gua_name_hant: Mapped[str | None] = mapped_column(String(50), nullable=True)
    target_gua_name: Mapped[str | None] = mapped_column(String(50), nullable=True)
    has_changing_yao: Mapped[bool | None] = mapped_column(Boolean, nullable=True)
    sign_level: Mapped[str | None] = mapped_column(String(20), nullable=True)
    # PostgreSQL text[] column.
    keywords: Mapped[list[str] | None] = mapped_column(
        ARRAY(Text()),
        nullable=True,
    )

    # --- model usage ----------------------------------------------------
    model_code: Mapped[str | None] = mapped_column(String(50), nullable=True)
    total_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
    # Fixed-point value: 12 digits in total, 6 of them after the decimal point.
    total_cost: Mapped[Decimal | None] = mapped_column(
        Numeric(12, 6),
        nullable=True,
    )
    total_latency_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)

    # --- timestamps (all timezone-aware) --------------------------------
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
    )
    last_activity_at: Mapped[datetime | None] = mapped_column(
        DateTime(timezone=True),
        nullable=True,
    )
    anonymized_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
    )
|
||||
@@ -0,0 +1,162 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
from models.agent_chat_message import AgentChatMessage
|
||||
from models.agent_chat_session import AgentChatSession
|
||||
from models.anonymous_session_snapshot import AnonymousSessionSnapshot
|
||||
from core.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def _truncate_to_day(dt: datetime) -> datetime:
|
||||
return dt.replace(hour=0, minute=0, second=0, microsecond=0)
|
||||
|
||||
|
||||
def _extract_derived_fields(
|
||||
messages: list[AgentChatMessage],
|
||||
) -> dict[str, Any]:
|
||||
for message in messages:
|
||||
metadata_raw = message.metadata_json
|
||||
if not isinstance(metadata_raw, dict):
|
||||
continue
|
||||
agent_output = metadata_raw.get("agent_output")
|
||||
if not isinstance(agent_output, dict):
|
||||
continue
|
||||
derived = agent_output.get("divination_derived")
|
||||
if isinstance(derived, dict) and derived:
|
||||
return derived
|
||||
return {}
|
||||
|
||||
|
||||
def _extract_sign_level(
|
||||
messages: list[AgentChatMessage],
|
||||
) -> str | None:
|
||||
for message in messages:
|
||||
metadata_raw = message.metadata_json
|
||||
if not isinstance(metadata_raw, dict):
|
||||
continue
|
||||
agent_output = metadata_raw.get("agent_output")
|
||||
if not isinstance(agent_output, dict):
|
||||
continue
|
||||
sign_level = agent_output.get("sign_level")
|
||||
if isinstance(sign_level, str) and sign_level:
|
||||
return sign_level
|
||||
return None
|
||||
|
||||
|
||||
def _extract_keywords(
|
||||
messages: list[AgentChatMessage],
|
||||
) -> list[str] | None:
|
||||
for message in messages:
|
||||
metadata_raw = message.metadata_json
|
||||
if not isinstance(metadata_raw, dict):
|
||||
continue
|
||||
agent_output = metadata_raw.get("agent_output")
|
||||
if not isinstance(agent_output, dict):
|
||||
continue
|
||||
keywords = agent_output.get("keywords")
|
||||
if isinstance(keywords, list) and keywords:
|
||||
return keywords
|
||||
return None
|
||||
|
||||
|
||||
def _extract_question_type(
    messages: list[AgentChatMessage],
) -> str | None:
    """Return the question type recorded for the conversation, if any.

    Lookup order:

    1. ``questionType`` inside the derived-fields dict returned by
       ``_extract_derived_fields``.
    2. Otherwise, the first non-empty string found at
       ``metadata_json["agent_output"]["questionType"]`` across *messages*.

    The second step runs whenever the first yields nothing, including the
    case where derived fields exist but simply lack a ``questionType`` entry,
    so a usable fallback value is never discarded.

    Returns:
        The question type string, or ``None`` when neither source has one.
    """
    derived_value = _extract_derived_fields(messages).get("questionType")
    if isinstance(derived_value, str) and derived_value:
        return derived_value

    for message in messages:
        metadata = message.metadata_json
        if not isinstance(metadata, dict):
            continue
        agent_output = metadata.get("agent_output")
        if not isinstance(agent_output, dict):
            continue
        fallback_value = agent_output.get("questionType")
        if isinstance(fallback_value, str) and fallback_value:
            return fallback_value
    return None
|
||||
|
||||
|
||||
def _extract_model_code(
|
||||
messages: list[AgentChatMessage],
|
||||
) -> str | None:
|
||||
for message in messages:
|
||||
if message.model_code:
|
||||
return message.model_code
|
||||
return None
|
||||
|
||||
|
||||
def _extract_tool_name(
|
||||
messages: list[AgentChatMessage],
|
||||
) -> str | None:
|
||||
for message in messages:
|
||||
if message.tool_name:
|
||||
return message.tool_name
|
||||
return None
|
||||
|
||||
|
||||
def _aggregate_latency(
|
||||
messages: list[AgentChatMessage],
|
||||
) -> int | None:
|
||||
total = 0
|
||||
found = False
|
||||
for message in messages:
|
||||
if message.latency_ms is not None:
|
||||
total += message.latency_ms
|
||||
found = True
|
||||
return total if found else None
|
||||
|
||||
|
||||
def anonymize(
    session: AgentChatSession,
    messages: list[AgentChatMessage],
) -> AnonymousSessionSnapshot:
    """Build an ``AnonymousSessionSnapshot`` from a session and its messages.

    The snapshot gets fresh random ``id`` and ``anonymous_id`` values, copies
    the session's counters (message count, tokens, cost), and fills the
    remaining fields from message metadata via the ``_extract_*`` helpers.
    ``created_at`` and ``last_activity_at`` are truncated to midnight of their
    date; ``anonymized_at`` is the current UTC time.

    Args:
        session: The chat session being anonymized.
        messages: The session's messages, scanned in the given order; for
            each extracted field the first message with a usable value wins.

    Returns:
        A new, unsaved ``AnonymousSessionSnapshot``; persisting it is left to
        the caller.
    """
    derived = _extract_derived_fields(messages)

    # `derived` is always a dict (possibly empty), so .get() is safe as-is.
    gua_name = derived.get("guaName")
    gua_name_hant = derived.get("guaNameHant")
    target_gua_name = derived.get("targetGuaName")
    has_changing_yao = derived.get("hasChangingYao")

    created_at = _truncate_to_day(session.created_at)
    last_activity_at = (
        _truncate_to_day(session.last_activity_at) if session.last_activity_at else None
    )

    # The snapshot's status column is nullable: keep a missing status as None
    # instead of storing the literal string "None".
    status = None if session.status is None else _enum_text(session.status)

    return AnonymousSessionSnapshot(
        id=uuid4(),
        anonymous_id=uuid4(),
        session_type=_enum_text(session.session_type),
        message_count=session.message_count,
        status=status,
        question_type=_extract_question_type(messages),
        tool_name=_extract_tool_name(messages),
        # Derived values come from free-form JSON; anything of the wrong type
        # is dropped rather than coerced.
        gua_name=gua_name if isinstance(gua_name, str) else None,
        gua_name_hant=gua_name_hant if isinstance(gua_name_hant, str) else None,
        target_gua_name=target_gua_name if isinstance(target_gua_name, str) else None,
        has_changing_yao=has_changing_yao
        if isinstance(has_changing_yao, bool)
        else None,
        sign_level=_extract_sign_level(messages),
        keywords=_extract_keywords(messages),
        model_code=_extract_model_code(messages),
        total_tokens=session.total_tokens,
        total_cost=session.total_cost,
        total_latency_ms=_aggregate_latency(messages),
        created_at=created_at,
        last_activity_at=last_activity_at,
        anonymized_at=datetime.now(timezone.utc),
    )


def _enum_text(value: Any) -> str:
    """Return ``value.value`` for enum-like objects, otherwise ``str(value)``."""
    return value.value if hasattr(value, "value") else str(value)
|
||||
@@ -5,7 +5,7 @@ from decimal import Decimal
|
||||
from typing import Any, Protocol
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
from sqlalchemy import Select, func, select
|
||||
from sqlalchemy import Select, delete, func, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from core.http.errors import ApiProblemError
|
||||
@@ -17,6 +17,7 @@ from schemas.domain.chat_message import (
|
||||
AgentChatMessage as AgentChatMessageSchema,
|
||||
AgentChatMessageMetadata,
|
||||
)
|
||||
from v1.agent.anonymizer import anonymize
|
||||
|
||||
|
||||
class ToolResultPayloadStorage(Protocol):
|
||||
@@ -96,7 +97,7 @@ class AgentRepository:
|
||||
async def rollback(self) -> None:
|
||||
await self._session.rollback()
|
||||
|
||||
async def delete_session(self, *, session_id: str) -> None:
|
||||
async def delete_session(self, *, session_id: str) -> list[dict[str, str]]:
|
||||
try:
|
||||
session_uuid = UUID(session_id)
|
||||
except ValueError as exc:
|
||||
@@ -112,11 +113,50 @@ class AgentRepository:
|
||||
)
|
||||
session = (await self._session.execute(stmt)).scalar_one_or_none()
|
||||
if session is None:
|
||||
return
|
||||
return []
|
||||
if session.deleted_at is not None:
|
||||
return
|
||||
session.deleted_at = datetime.now(timezone.utc)
|
||||
return []
|
||||
messages_stmt = (
|
||||
select(AgentChatMessage)
|
||||
.where(AgentChatMessage.session_id == session_uuid)
|
||||
.order_by(AgentChatMessage.seq)
|
||||
)
|
||||
messages = list((await self._session.execute(messages_stmt)).scalars().all())
|
||||
attachment_paths = self._collect_attachment_paths(messages)
|
||||
snapshot = anonymize(session=session, messages=messages)
|
||||
self._session.add(snapshot)
|
||||
await self._session.flush()
|
||||
stmt_delete_messages = delete(AgentChatMessage).where(
|
||||
AgentChatMessage.session_id == session_uuid
|
||||
)
|
||||
await self._session.execute(stmt_delete_messages)
|
||||
stmt_delete_session = delete(AgentChatSession).where(
|
||||
AgentChatSession.id == session_uuid
|
||||
)
|
||||
await self._session.execute(stmt_delete_session)
|
||||
await self._session.flush()
|
||||
return attachment_paths
|
||||
|
||||
@staticmethod
|
||||
def _collect_attachment_paths(
|
||||
messages: list[AgentChatMessage],
|
||||
) -> list[dict[str, str]]:
|
||||
paths: list[dict[str, str]] = []
|
||||
for message in messages:
|
||||
metadata_raw = message.metadata_json
|
||||
if not isinstance(metadata_raw, dict):
|
||||
continue
|
||||
attachments_raw = metadata_raw.get("user_message_attachments")
|
||||
if not isinstance(attachments_raw, list):
|
||||
continue
|
||||
for attachment in attachments_raw:
|
||||
if not isinstance(attachment, dict):
|
||||
continue
|
||||
bucket = attachment.get("bucket")
|
||||
path = attachment.get("path")
|
||||
if isinstance(bucket, str) and isinstance(path, str):
|
||||
paths.append({"bucket": bucket, "path": path})
|
||||
return paths
|
||||
|
||||
async def persist_user_message(
|
||||
self,
|
||||
|
||||
@@ -23,7 +23,7 @@ class AgentRepositoryLike(Protocol):
|
||||
|
||||
async def rollback(self) -> None: ...
|
||||
|
||||
async def delete_session(self, *, session_id: str) -> None: ...
|
||||
async def delete_session(self, *, session_id: str) -> list[dict[str, str]]: ...
|
||||
|
||||
async def get_history_day(
|
||||
self,
|
||||
@@ -126,6 +126,8 @@ class AttachmentStorageLike(Protocol):
|
||||
expires_in_seconds: int,
|
||||
) -> str: ...
|
||||
|
||||
async def delete_prefix(self, *, bucket: str, prefix: str) -> int: ...
|
||||
|
||||
def parse_signed_url(self, url: str) -> tuple[str, str]: ...
|
||||
|
||||
|
||||
|
||||
@@ -235,8 +235,28 @@ class AgentService:
|
||||
return
|
||||
raise
|
||||
ensure_session_owner(owner_id=owner, current_user=current_user)
|
||||
await self._repository.delete_session(session_id=thread_id)
|
||||
attachment_paths = await self._repository.delete_session(session_id=thread_id)
|
||||
await self._repository.commit()
|
||||
await self._cleanup_attachments(attachment_paths)
|
||||
|
||||
async def _cleanup_attachments(
|
||||
self, attachment_paths: list[dict[str, str]]
|
||||
) -> None:
|
||||
if not attachment_paths or self._attachment_storage is None:
|
||||
return
|
||||
for attachment in attachment_paths:
|
||||
bucket = attachment.get("bucket")
|
||||
path = attachment.get("path")
|
||||
if not bucket or not path:
|
||||
continue
|
||||
try:
|
||||
await self._attachment_storage.delete_prefix(bucket=bucket, prefix=path)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
"attachment_cleanup_failed",
|
||||
bucket=bucket,
|
||||
path=path,
|
||||
)
|
||||
|
||||
async def _append_context_cache_user_message(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user