From fb163cfeab2412f1d820115ad3bc49769ebbb967 Mon Sep 17 00:00:00 2001
From: qzl
Date: Wed, 29 Apr 2026 18:04:25 +0800
Subject: [PATCH] chore(deploy): add backend ECR deployment flow

---
Reviewer note: the workflow and docker-compose hunks below were amended during
review (secrets passed via env, quoted image references, Redis no longer loads
the application .env). The `index` blob ids of those two files still refer to
the pre-amendment contents.

 .dockerignore                                |  21 +++
 .gitea/workflows/build-production-docker.yml | 102 +++++++++++
 .../IMPLEMENTATION_PLAN.md                   |  27 +++
 .../04-29-cicd-ecr-deployment-flow/prd.md    |  27 +++
 .../04-29-cicd-ecr-deployment-flow/task.json |  49 ++++++
 backend/Dockerfile                           |  31 ++++
 deploy/README.md                             | 159 ++++++++++++++++++
 deploy/docker-compose.prod.yml               |  78 +++++++++
 8 files changed, 494 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .gitea/workflows/build-production-docker.yml
 create mode 100644 .trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/IMPLEMENTATION_PLAN.md
 create mode 100644 .trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/prd.md
 create mode 100644 .trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/task.json
 create mode 100644 backend/Dockerfile
 create mode 100644 deploy/README.md
 create mode 100644 deploy/docker-compose.prod.yml

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..765adfe
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,21 @@
+.git
+.gitea
+.github
+.trellis
+.venv
+.env
+.env.*
+__pycache__
+*.py[cod]
+.pytest_cache
+.ruff_cache
+.mypy_cache
+.pyright
+logs
+midscene_run
+apps/.dart_tool
+apps/build
+apps/.pub
+apps/.gradle
+backend/.ruff_cache
+infra/docker/supabase/volumes
diff --git a/.gitea/workflows/build-production-docker.yml b/.gitea/workflows/build-production-docker.yml
new file mode 100644
index 0000000..264bbbb
--- /dev/null
+++ b/.gitea/workflows/build-production-docker.yml
@@ -0,0 +1,102 @@
+name: Build production Docker image
+
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  build-backend-image:
+    runs-on: wsl2-docker-host
+    env:
+      IMAGE_NAME: eryao-backend
+      IMAGE_SIZE_LIMIT_BYTES: 500000000
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      # Secrets are exposed through env instead of being interpolated into the
+      # script text, so a secret value can never be parsed as shell source.
+      - name: Validate ECR configuration
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_REGION: ${{ secrets.AWS_REGION }}
+          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
+          ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY }}
+        run: |
+          set -euo pipefail
+          test -n "${AWS_ACCESS_KEY_ID:-}"
+          test -n "${AWS_SECRET_ACCESS_KEY:-}"
+          test -n "${AWS_REGION:-}"
+          test -n "${AWS_ACCOUNT_ID:-}"
+          test -n "${ECR_REPOSITORY:-}"
+
+      - name: Build backend production image
+        run: |
+          set -euo pipefail
+          docker buildx build \
+            --provenance=false \
+            --load \
+            --file backend/Dockerfile \
+            --tag "${IMAGE_NAME}:prod-${GITHUB_SHA}" \
+            --tag "${IMAGE_NAME}:prod-latest" \
+            .
+
+      - name: Check image size budget
+        run: |
+          set -euo pipefail
+          image_size_bytes="$(docker image inspect "${IMAGE_NAME}:prod-${GITHUB_SHA}" --format '{{.Size}}')"
+          echo "Image size: ${image_size_bytes} bytes"
+          if [ "${image_size_bytes}" -gt "${IMAGE_SIZE_LIMIT_BYTES}" ]; then
+            echo "Image exceeds ${IMAGE_SIZE_LIMIT_BYTES} bytes" >&2
+            exit 1
+          fi
+
+      - name: Smoke test backend image
+        run: |
+          set -euo pipefail
+          docker run --rm \
+            -e ERYAO_RUNTIME__ENVIRONMENT=prod \
+            -e ERYAO_SUPABASE__PUBLIC_URL=http://localhost:8001 \
+            -e ERYAO_POINTS_POLICY__REGISTER_BONUS_HMAC_KEY=ci-smoke-test-key \
+            --entrypoint python \
+            "${IMAGE_NAME}:prod-${GITHUB_SHA}" \
+            -c "import app; print(app.app.title)"
+
+      - name: Push backend image to ECR
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }}
+          AWS_REGION: ${{ secrets.AWS_REGION }}
+          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
+          ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY }}
+        run: |
+          set -euo pipefail
+          caller_account_id="$(aws sts get-caller-identity --query Account --output text)"
+          if [ "${caller_account_id}" != "${AWS_ACCOUNT_ID}" ]; then
+            echo "AWS_ACCOUNT_ID does not match caller identity" >&2
+            exit 1
+          fi
+
+          ecr_registry="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
+          ecr_image="${ecr_registry}/${ECR_REPOSITORY}"
+
+          aws ecr describe-repositories \
+            --region "${AWS_REGION}" \
+            --repository-names "${ECR_REPOSITORY}" >/dev/null 2>&1 \
+            || aws ecr create-repository \
+              --region "${AWS_REGION}" \
+              --repository-name "${ECR_REPOSITORY}" \
+              --image-scanning-configuration scanOnPush=true \
+              --encryption-configuration encryptionType=AES256 >/dev/null
+
+          aws ecr get-login-password --region "${AWS_REGION}" \
+            | docker login --username AWS --password-stdin "${ecr_registry}"
+
+          docker tag "${IMAGE_NAME}:prod-${GITHUB_SHA}" "${ecr_image}:${GITHUB_SHA}"
+          docker tag "${IMAGE_NAME}:prod-${GITHUB_SHA}" "${ecr_image}:latest"
+          docker push "${ecr_image}:${GITHUB_SHA}"
+          docker push "${ecr_image}:latest"
diff --git a/.trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/IMPLEMENTATION_PLAN.md b/.trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/IMPLEMENTATION_PLAN.md
new file mode 100644
index 0000000..6a75c11
--- /dev/null
+++ b/.trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/IMPLEMENTATION_PLAN.md
@@ -0,0 +1,27 @@
+# CI/CD ECR Deployment Flow Completion
+
+## Completed
+
+- Production backend Docker image workflow exists at `.gitea/workflows/build-production-docker.yml`.
+- Workflow trigger is configured for push to `main` and manual `workflow_dispatch`.
+- Workflow builds `backend/Dockerfile` with Docker Buildx, validates image size, and runs a smoke test.
+- Workflow logs in to ECR, creates the repository if missing, and pushes both `${GITHUB_SHA}` and `latest` tags.
+- Production Docker Compose file exists at `deploy/docker-compose.prod.yml` and pulls images from ECR instead of building locally.
+- Production deploy guide exists at `deploy/README.md` with EC2-side ECR login, Compose pull/up, health check, logs, and stop commands.
+- Cloudflare IPv4 ingress rules were added to AWS security group `sg-064bf6675c881fde3` for `tcp/80` and `tcp/443`.
+
+## Deferred Intentionally
+
+- EC2 will not auto-pull and restart yet. The operator will log in to the single EC2 host and start Docker Compose manually after ECR image confirmation.
+- Public `0.0.0.0/0` ingress for `tcp/80` and `tcp/443` remains until `https://api.meeyao.com` or the agreed health endpoint is confirmed healthy.
+- Gitea workflow does not yet include SSH or SSM deployment steps.
+
+## Verification To Perform After PR Merge
+
+1. Confirm the PR is merged to `main` or otherwise pushed to `main`.
+2. Confirm Gitea Actions runs the production Docker workflow successfully.
+3. Confirm ECR contains the backend image tagged with the commit SHA and `latest`.
+4. Operator manually logs in to EC2 and runs the documented Compose deployment commands.
+5. Confirm local EC2 health check returns `{"status":"ok"}`.
+6. Confirm external API health through Cloudflare.
+7. Remove `0.0.0.0/0` ingress for `tcp/80` and `tcp/443` only after external health is confirmed.
diff --git a/.trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/prd.md b/.trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/prd.md
new file mode 100644
index 0000000..4f4bd3d
--- /dev/null
+++ b/.trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/prd.md
@@ -0,0 +1,27 @@
+# CI/CD ECR Deployment Flow Record
+
+## Goal
+
+Record the current production CI/CD state for the backend Docker deployment path and preserve the handoff point before EC2 manual service startup.
+
+## Scope
+
+- Document that pushes to `main` trigger the Gitea workflow to build the backend Docker image.
+- Document that the workflow validates the image and pushes `${GITHUB_SHA}` and `latest` tags to AWS ECR.
+- Document that Cloudflare IPv4 CIDR ingress rules were added for `tcp/80` and `tcp/443` on security group `sg-064bf6675c881fde3` in `us-east-2`.
+- Document that the open `0.0.0.0/0` ingress rules for `tcp/80` and `tcp/443` remain in place until the API is healthy.
+- Document that final EC2 service startup is intentionally manual: the operator will log in to the single EC2 host and run Docker Compose after confirming the image exists in ECR.
+
+## Out of Scope
+
+- Automated SSH or SSM deployment to EC2.
+- ECS task definition or service deployment.
+- Removing the public `0.0.0.0/0` security group rules before API health is confirmed.
+
+## Acceptance Criteria
+
+- Trellis task records the completed CI/CD preparation work.
+- The task is archived after recording completion.
+- The temporary root-level `DEPLOYMENT_REPORT.md` is removed.
+- Current repository changes are committed on `dev`, pushed, and proposed for merge to `main`.
+- After merge or main push triggers CI, ECR is checked for the uploaded backend image.
diff --git a/.trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/task.json b/.trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/task.json
new file mode 100644
index 0000000..2d416c2
--- /dev/null
+++ b/.trellis/tasks/archive/2026-04/04-29-cicd-ecr-deployment-flow/task.json
@@ -0,0 +1,49 @@
+{
+  "id": "cicd-ecr-deployment-flow",
+  "name": "cicd-ecr-deployment-flow",
+  "title": "Record CI/CD ECR deployment flow",
+  "description": "Record completed backend Docker CI/CD preparation through ECR push and the remaining manual EC2 Docker Compose startup step.",
+  "status": "completed",
+  "dev_type": "docs",
+  "scope": "deployment",
+  "priority": "P2",
+  "creator": "zl-q",
+  "assignee": "zl-q",
+  "createdAt": "2026-04-29",
+  "completedAt": "2026-04-29",
+  "branch": null,
+  "base_branch": "dev",
+  "worktree_path": null,
+  "current_phase": 0,
+  "next_action": [
+    {
+      "phase": 1,
+      "action": "implement"
+    },
+    {
+      "phase": 2,
+      "action": "check"
+    },
+    {
+      "phase": 3,
+      "action": "finish"
+    },
+    {
+      "phase": 4,
+      "action": "create-pr"
+    }
+  ],
+  "commit": null,
+  "pr_url": null,
+  "subtasks": [],
+  "children": [],
+  "parent": null,
+  "relatedFiles": [
+    ".gitea/workflows/build-production-docker.yml",
+    "backend/Dockerfile",
+    "deploy/docker-compose.prod.yml",
+    "deploy/README.md"
+  ],
+  "notes": "CI/CD is complete through ECR image push. EC2 remains single-host Docker Compose and will be started manually after ECR image confirmation. Cloudflare IPv4 ingress was added; public 0.0.0.0/0 ingress remains until API health is confirmed.",
+  "meta": {}
+}
diff --git a/backend/Dockerfile b/backend/Dockerfile
new file mode 100644
index 0000000..37f3640
--- /dev/null
+++ b/backend/Dockerfile
@@ -0,0 +1,31 @@
+FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS builder
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    UV_LINK_MODE=copy
+
+WORKDIR /app
+
+COPY pyproject.toml uv.lock ./
+RUN uv sync --frozen --no-dev --no-install-project --no-cache
+RUN find /app/.venv -type d \( -name __pycache__ -o -name test -o -name tests \) -prune -exec rm -rf {} + \
+    && if command -v strip >/dev/null 2>&1; then \
+        find /app/.venv -type f -name "*.so" -exec strip --strip-unneeded {} +; \
+    fi
+
+FROM python:3.12-slim-bookworm
+
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONPATH=/app/backend/src \
+    PATH="/app/.venv/bin:$PATH"
+
+WORKDIR /app
+
+COPY --from=builder /app/.venv ./.venv
+
+COPY backend ./backend
+
+EXPOSE 5775
+
+CMD ["sh", "-c", "exec uvicorn app:app --host ${ERYAO_WEB__HOST:-0.0.0.0} --port ${ERYAO_WEB__PORT:-5775} --workers ${ERYAO_WEB__WORKERS:-2} --log-level $(printf '%s' ${ERYAO_RUNTIME__LOG_LEVEL:-info} | tr '[:upper:]' '[:lower:]')"]
diff --git a/deploy/README.md b/deploy/README.md
new file mode 100644
index 0000000..1116d8f
--- /dev/null
+++ b/deploy/README.md
@@ -0,0 +1,159 @@
+# 觅爻生产部署指南
+
+## 目录说明
+
+`deploy/` 用于存放生产环境启动所需文件:
+
+- `docker-compose.prod.yml`:生产 Docker Compose 启动配置,只拉取已有镜像,不负责构建。
+- `.env`:生产环境变量文件,该文件包含敏感信息,不应提交到 Git。
+
+## 前置条件
+
+生产机器需要安装:
+
+- Docker
+- Docker Compose v2
+- AWS CLI v2
+
+确认命令:
+
+```bash
+docker --version
+docker compose version
+aws --version
+```
+
+## 环境变量
+
+`docker-compose.prod.yml` 默认从当前目录读取 `.env`:
+
+```bash
+deploy/.env
+```
+
+必须包含 AWS ECR 镜像定位变量:
+
+```text
+AWS_ACCOUNT_ID=<你的 AWS 账号 ID>
+AWS_REGION=<AWS 区域>
+ECR_REPOSITORY=<ECR 仓库名>
+```
+
+如果本目录下的 `.env` 是从项目根目录 `.env` 复制过来的,通常还需要手动追加以上三个变量。
+
+默认镜像地址会拼接为:
+
+```text
+${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY}:latest
+```
+
+如果要手动指定完整镜像地址,可以在 `.env` 中设置:
+
+```text
+ERYAO_BACKEND_IMAGE=<完整镜像地址>
+```
+
+Web 服务端口使用项目环境变量:
+
+```text
+ERYAO_WEB__PORT=5775
+```
+
+默认只绑定本机回环地址:
+
+```text
+ERYAO_DEPLOY_BIND_HOST=127.0.0.1
+```
+
+如果生产机器没有 Nginx、ALB 或其他反向代理,需要直接对外暴露端口,可改为:
+
+```text
+ERYAO_DEPLOY_BIND_HOST=0.0.0.0
+```
+
+## 登录 ECR
+
+进入部署目录,并把 `.env` 加载到当前 shell:
+
+```bash
+cd deploy
+set -a
+. ./.env
+set +a
+```
+
+在生产机器上配置好 AWS 凭据后执行:
+
+```bash
+aws ecr get-login-password --region "$AWS_REGION" \
+  | docker login --username AWS --password-stdin \
+    "${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
+```
+
+## 启动服务
+
+启动 Web、Redis 和 worker:
+
+```bash
+cd deploy
+docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers pull
+docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers up -d
+```
+
+只启动 Web 和 Redis:
+
+```bash
+cd deploy
+docker compose --env-file ./.env -f docker-compose.prod.yml up -d
+```
+
+## 健康检查
+
+如果 `ERYAO_WEB__PORT=5775`:
+
+```bash
+curl http://127.0.0.1:5775/health
+```
+
+期望返回:
+
+```json
+{"status":"ok"}
+```
+
+## 查看状态和日志
+
+```bash
+cd deploy
+docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers ps
+docker logs -f eryao-prod-backend
+docker logs -f eryao-prod-worker-agent
+docker logs -f eryao-prod-worker-general
+docker logs -f eryao-prod-redis
+```
+
+## 更新版本
+
+CI 推送新镜像到 ECR 后,在生产机器执行:
+
+```bash
+cd deploy
+docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers pull
+docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers up -d
+```
+
+## 停止服务
+
+```bash
+cd deploy
+docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers down
+```
+
+如需连 Redis 数据卷一起删除:
+
+```bash
+cd deploy
+docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers down -v
+```
+
+谨慎使用 `down -v`,它会删除 Redis 持久化数据。
diff --git a/deploy/docker-compose.prod.yml b/deploy/docker-compose.prod.yml
new file mode 100644
index 0000000..1284570
--- /dev/null
+++ b/deploy/docker-compose.prod.yml
@@ -0,0 +1,78 @@
+name: eryao-prod
+
+x-backend-common: &backend-common
+  image: ${ERYAO_BACKEND_IMAGE:-${AWS_ACCOUNT_ID:?AWS_ACCOUNT_ID is required}.dkr.ecr.${AWS_REGION:?AWS_REGION is required}.amazonaws.com/${ECR_REPOSITORY:?ECR_REPOSITORY is required}:latest}
+  env_file:
+    - path: ./.env
+      required: true
+  depends_on:
+    redis:
+      condition: service_healthy
+  restart: unless-stopped
+
+services:
+  backend:
+    <<: *backend-common
+    container_name: eryao-prod-backend
+    environment:
+      ERYAO_RUNTIME__ENVIRONMENT: prod
+      ERYAO_RUNTIME__SERVICE_NAME: web
+      ERYAO_REDIS__HOST: redis
+      ERYAO_REDIS__PORT: "6379"
+    ports:
+      - "${ERYAO_DEPLOY_BIND_HOST:-127.0.0.1}:${ERYAO_WEB__PORT:-5775}:${ERYAO_WEB__PORT:-5775}"
+
+  worker-agent:
+    <<: *backend-common
+    container_name: eryao-prod-worker-agent
+    profiles: ["workers"]
+    environment:
+      ERYAO_RUNTIME__ENVIRONMENT: prod
+      ERYAO_RUNTIME__SERVICE_NAME: worker-agent
+      ERYAO_REDIS__HOST: redis
+      ERYAO_REDIS__PORT: "6379"
+    command:
+      - sh
+      - -c
+      - exec taskiq worker core.taskiq.app:worker_agent_broker core.agentscope.runtime.tasks --workers ${ERYAO_WORKER__GROUPS__AGENT__CONCURRENCY:-2}
+
+  worker-general:
+    <<: *backend-common
+    container_name: eryao-prod-worker-general
+    profiles: ["workers"]
+    environment:
+      ERYAO_RUNTIME__ENVIRONMENT: prod
+      ERYAO_RUNTIME__SERVICE_NAME: worker-general
+      ERYAO_REDIS__HOST: redis
+      ERYAO_REDIS__PORT: "6379"
+    command:
+      - sh
+      - -c
+      - exec taskiq worker core.taskiq.app:worker_general_broker core.agentscope.runtime.tasks v1.feedback.tasks --workers ${ERYAO_WORKER__GROUPS__GENERAL__CONCURRENCY:-1}
+
+  redis:
+    image: redis:7.4.2-alpine
+    container_name: eryao-prod-redis
+    # Only the Redis password is passed in; the application .env holds unrelated
+    # secrets and is deliberately not loaded into this container.
+    environment:
+      REDIS_PASSWORD: ${ERYAO_REDIS__PASSWORD:-}
+    command: >
+      sh -c 'if [ -n "$$REDIS_PASSWORD" ]; then redis-server --appendonly yes --requirepass "$$REDIS_PASSWORD"; else redis-server --appendonly yes; fi'
+    volumes:
+      - redis_data:/data
+    healthcheck:
+      test:
+        [
+          "CMD",
+          "sh",
+          "-c",
+          'if [ -n "$$REDIS_PASSWORD" ]; then redis-cli -a "$$REDIS_PASSWORD" ping; else redis-cli ping; fi',
+        ]
+      interval: 5s
+      timeout: 3s
+      retries: 5
+    restart: unless-stopped
+
+volumes:
+  redis_data: