chore(deploy): add backend ECR deployment flow

This commit is contained in:
qzl
2026-04-29 18:04:25 +08:00
parent 59760416e6
commit fb163cfeab
8 changed files with 487 additions and 0 deletions
+21
View File
@@ -0,0 +1,21 @@
# Build-context exclusions for the backend image build.
#
# .dockerignore patterns are anchored at the root of the build context (unlike
# .gitignore), so anything that can appear in nested directories needs a
# leading `**/` to be excluded everywhere. `**/` also matches the root itself.

# VCS, CI and task-tooling metadata
.git
.gitea
.github
.trellis

# Local environment files
# NOTE(review): these two patterns only match at the context root; confirm no
# nested env files (e.g. under backend/) need to be kept out of the image.
.env
.env.*

# Python virtualenvs, bytecode and tool caches at any depth. Without the `**/`
# prefix these would still be copied by `COPY backend ./backend`.
**/.venv
**/__pycache__
**/*.py[cod]
**/.pytest_cache
**/.ruff_cache
**/.mypy_cache
**/.pyright

# Runtime output directories
logs
midscene_run

# Build and tool artifacts under apps/
apps/.dart_tool
apps/build
apps/.pub
apps/.gradle

# Local Supabase docker volumes
infra/docker/supabase/volumes
@@ -0,0 +1,94 @@
# Builds the backend production image on pushes to main (or manual dispatch),
# enforces an image size budget, smoke-tests the image, and pushes it to ECR
# under both the commit SHA and `latest`.
name: Build production Docker image

on:
  push:
    branches:
      - main
  workflow_dispatch:

jobs:
  build-backend-image:
    runs-on: wsl2-docker-host
    env:
      IMAGE_NAME: eryao-backend
      # Maximum allowed image size in bytes; the size check step fails above it.
      IMAGE_SIZE_LIMIT_BYTES: "500000000"
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Validate ECR configuration
        # Secrets are handed to the script through the environment instead of
        # being expanded into the script text, so their contents can never be
        # interpreted as shell syntax.
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY }}
        run: |
          set -euo pipefail
          # `${VAR:?msg}` aborts when VAR is unset or empty and names the
          # missing secret without printing any value.
          : "${AWS_ACCESS_KEY_ID:?secret AWS_ACCESS_KEY_ID is not set}"
          : "${AWS_SECRET_ACCESS_KEY:?secret AWS_SECRET_ACCESS_KEY is not set}"
          : "${AWS_REGION:?secret AWS_REGION is not set}"
          : "${AWS_ACCOUNT_ID:?secret AWS_ACCOUNT_ID is not set}"
          : "${ECR_REPOSITORY:?secret ECR_REPOSITORY is not set}"

      - name: Build backend production image
        run: |
          set -euo pipefail
          # --load keeps the result in the local Docker image store so the
          # following steps can inspect, run and push it.
          docker buildx build \
            --provenance=false \
            --load \
            --file backend/Dockerfile \
            --tag "${IMAGE_NAME}:prod-${GITHUB_SHA}" \
            --tag "${IMAGE_NAME}:prod-latest" \
            .

      - name: Check image size budget
        run: |
          set -euo pipefail
          image_size_bytes="$(docker image inspect "${IMAGE_NAME}:prod-${GITHUB_SHA}" --format '{{.Size}}')"
          echo "Image size: ${image_size_bytes} bytes"
          if [ "${image_size_bytes}" -gt "${IMAGE_SIZE_LIMIT_BYTES}" ]; then
            echo "Image exceeds ${IMAGE_SIZE_LIMIT_BYTES} bytes" >&2
            exit 1
          fi

      - name: Smoke test backend image
        run: |
          set -euo pipefail
          # Imports the application module inside the built image and prints
          # its title; the values below are placeholders for the smoke test.
          docker run --rm \
            -e ERYAO_RUNTIME__ENVIRONMENT=prod \
            -e ERYAO_SUPABASE__PUBLIC_URL=http://localhost:8001 \
            -e ERYAO_POINTS_POLICY__REGISTER_BONUS_HMAC_KEY=ci-smoke-test-key \
            --entrypoint python \
            "${IMAGE_NAME}:prod-${GITHUB_SHA}" \
            -c "import app; print(app.app.title)"

      - name: Push backend image to ECR
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: ${{ secrets.AWS_REGION }}
          AWS_REGION: ${{ secrets.AWS_REGION }}
          AWS_ACCOUNT_ID: ${{ secrets.AWS_ACCOUNT_ID }}
          ECR_REPOSITORY: ${{ secrets.ECR_REPOSITORY }}
        run: |
          set -euo pipefail

          # Refuse to push when the credentials belong to a different account
          # than the one the registry URL is built from.
          caller_account_id="$(aws sts get-caller-identity --query Account --output text)"
          if [ "${caller_account_id}" != "${AWS_ACCOUNT_ID}" ]; then
            echo "AWS_ACCOUNT_ID does not match caller identity" >&2
            exit 1
          fi

          ecr_registry="${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
          ecr_image="${ecr_registry}/${ECR_REPOSITORY}"

          # Create the repository only when ECR reports that it does not
          # exist. Any other failure (permissions, network, bad region) is
          # surfaced as-is instead of being mistaken for a missing repository.
          if ! describe_error="$(aws ecr describe-repositories \
            --region "${AWS_REGION}" \
            --repository-names "${ECR_REPOSITORY}" 2>&1 >/dev/null)"; then
            case "${describe_error}" in
              *RepositoryNotFoundException*)
                aws ecr create-repository \
                  --region "${AWS_REGION}" \
                  --repository-name "${ECR_REPOSITORY}" \
                  --image-scanning-configuration scanOnPush=true \
                  --encryption-configuration encryptionType=AES256 >/dev/null
                ;;
              *)
                printf '%s\n' "${describe_error}" >&2
                exit 1
                ;;
            esac
          fi

          aws ecr get-login-password --region "${AWS_REGION}" \
            | docker login --username AWS --password-stdin "${ecr_registry}"

          docker tag "${IMAGE_NAME}:prod-${GITHUB_SHA}" "${ecr_image}:${GITHUB_SHA}"
          docker tag "${IMAGE_NAME}:prod-${GITHUB_SHA}" "${ecr_image}:latest"
          docker push "${ecr_image}:${GITHUB_SHA}"
          docker push "${ecr_image}:latest"
@@ -0,0 +1,27 @@
# CI/CD ECR Deployment Flow Completion
## Completed
- Production backend Docker image workflow exists at `.gitea/workflows/build-production-docker.yml`.
- Workflow trigger is configured for push to `main` and manual `workflow_dispatch`.
- Workflow builds `backend/Dockerfile` with Docker Buildx, validates image size, and runs a smoke test.
- Workflow logs in to ECR, creates the repository if missing, and pushes both `${GITHUB_SHA}` and `latest` tags.
- Production Docker Compose file exists at `deploy/docker-compose.prod.yml` and pulls images from ECR instead of building locally.
- Production deploy guide exists at `deploy/README.md` with EC2-side ECR login, Compose pull/up, health check, logs, and stop commands.
- Cloudflare IPv4 ingress rules were added to AWS security group `sg-064bf6675c881fde3` for `tcp/80` and `tcp/443`.
## Deferred Intentionally
- EC2 will not auto-pull and restart yet. The operator will log in to the single EC2 host and start Docker Compose manually after ECR image confirmation.
- Public `0.0.0.0/0` ingress for `tcp/80` and `tcp/443` remains until `https://api.meeyao.com` or the agreed health endpoint is confirmed healthy.
- Gitea workflow does not yet include SSH or SSM deployment steps.
## Verification To Perform After PR Merge
1. Confirm the PR is merged to `main` or otherwise pushed to `main`.
2. Confirm Gitea Actions runs the production Docker workflow successfully.
3. Confirm ECR contains the backend image tagged with the commit SHA and `latest`.
4. Operator manually logs in to EC2 and runs the documented Compose deployment commands.
5. Confirm local EC2 health check returns `{"status":"ok"}`.
6. Confirm external API health through Cloudflare.
7. Remove `0.0.0.0/0` ingress for `tcp/80` and `tcp/443` only after external health is confirmed.
@@ -0,0 +1,27 @@
# CI/CD ECR Deployment Flow Record
## Goal
Record the current production CI/CD state for the backend Docker deployment path and preserve the handoff point before EC2 manual service startup.
## Scope
- Document that pushes to `main` trigger the Gitea workflow to build the backend Docker image.
- Document that the workflow validates the image and pushes `${GITHUB_SHA}` and `latest` tags to AWS ECR.
- Document that Cloudflare IPv4 CIDR ingress rules were added for `tcp/80` and `tcp/443` on security group `sg-064bf6675c881fde3` in `us-east-2`.
- Document that the open `0.0.0.0/0` ingress rules for `tcp/80` and `tcp/443` remain in place until the API is healthy.
- Document that final EC2 service startup is intentionally manual: the operator will log in to the single EC2 host and run Docker Compose after confirming the image exists in ECR.
## Out of Scope
- Automated SSH or SSM deployment to EC2.
- ECS task definition or service deployment.
- Removing the public `0.0.0.0/0` security group rules before API health is confirmed.
## Acceptance Criteria
- Trellis task records the completed CI/CD preparation work.
- The task is archived after recording completion.
- The temporary root-level `DEPLOYMENT_REPORT.md` is removed.
- Current repository changes are committed on `dev`, pushed, and proposed for merge to `main`.
- After merge or main push triggers CI, ECR is checked for the uploaded backend image.
@@ -0,0 +1,49 @@
{
"id": "cicd-ecr-deployment-flow",
"name": "cicd-ecr-deployment-flow",
"title": "Record CI/CD ECR deployment flow",
"description": "Record completed backend Docker CI/CD preparation through ECR push and the remaining manual EC2 Docker Compose startup step.",
"status": "completed",
"dev_type": "docs",
"scope": "deployment",
"priority": "P2",
"creator": "zl-q",
"assignee": "zl-q",
"createdAt": "2026-04-29",
"completedAt": "2026-04-29",
"branch": null,
"base_branch": "dev",
"worktree_path": null,
"current_phase": 0,
"next_action": [
{
"phase": 1,
"action": "implement"
},
{
"phase": 2,
"action": "check"
},
{
"phase": 3,
"action": "finish"
},
{
"phase": 4,
"action": "create-pr"
}
],
"commit": null,
"pr_url": null,
"subtasks": [],
"children": [],
"parent": null,
"relatedFiles": [
".gitea/workflows/build-production-docker.yml",
"backend/Dockerfile",
"deploy/docker-compose.prod.yml",
"deploy/README.md"
],
"notes": "CI/CD is complete through ECR image push. EC2 remains single-host Docker Compose and will be started manually after ECR image confirmation. Cloudflare IPv4 ingress was added; public 0.0.0.0/0 ingress remains until API health is confirmed.",
"meta": {}
}
+31
View File
@@ -0,0 +1,31 @@
# ---- Builder stage: resolve and install locked dependencies into /app/.venv ----
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim AS builder

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    UV_LINK_MODE=copy

WORKDIR /app

# Only the dependency manifests are copied here, so this layer is rebuilt only
# when pyproject.toml or uv.lock change.
COPY pyproject.toml uv.lock ./
RUN uv sync --frozen --no-dev --no-install-project --no-cache

# Shrink the virtualenv: drop bytecode caches and test/tests directories, and
# strip shared objects when a `strip` binary is available in the image.
RUN find /app/.venv -type d \( -name __pycache__ -o -name test -o -name tests \) -prune -exec rm -rf {} + \
    && if command -v strip >/dev/null 2>&1; then \
         find /app/.venv -type f -name "*.so" -exec strip --strip-unneeded {} +; \
       fi

# ---- Runtime stage: prebuilt virtualenv plus the backend sources ----
FROM python:3.12-slim-bookworm

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONPATH=/app/backend/src \
    PATH="/app/.venv/bin:$PATH"

WORKDIR /app

COPY --from=builder /app/.venv ./.venv
COPY backend ./backend

# NOTE(review): no USER instruction is set, so the process runs as the base
# image's default user (root). Consider a dedicated non-root user once the
# paths the application writes to at runtime are confirmed.

EXPOSE 5775

# Host, port, worker count and log level come from ERYAO_* variables with the
# defaults shown; the log level is lower-cased before being passed to uvicorn.
# Every expansion is quoted so a value containing whitespace or glob characters
# is passed as a single argument. `exec` replaces the shell with uvicorn.
CMD ["sh", "-c", "exec uvicorn app:app --host \"${ERYAO_WEB__HOST:-0.0.0.0}\" --port \"${ERYAO_WEB__PORT:-5775}\" --workers \"${ERYAO_WEB__WORKERS:-2}\" --log-level \"$(printf '%s' \"${ERYAO_RUNTIME__LOG_LEVEL:-info}\" | tr '[:upper:]' '[:lower:]')\""]
+159
View File
@@ -0,0 +1,159 @@
# 觅爻生产部署指南
## 目录说明
`deploy/` 用于存放生产环境启动所需文件:
- `docker-compose.prod.yml`:生产 Docker Compose 启动配置,只拉取已有镜像,不负责构建。
- `.env`:生产环境变量文件。该文件包含敏感信息,不应提交到 Git。
## 前置条件
生产机器需要安装:
- Docker
- Docker Compose v2
- AWS CLI v2
确认命令:
```bash
docker --version
docker compose version
aws --version
```
## 环境变量
`docker-compose.prod.yml` 默认从当前目录读取 `.env`
```bash
deploy/.env
```
必须包含 AWS ECR 镜像定位变量:
```text
AWS_ACCOUNT_ID=<你的 AWS 账号 ID>
AWS_REGION=<ECR 所在区域>
ECR_REPOSITORY=<ECR 仓库名>
```
如果本目录下的 `.env` 是从项目根目录 `.env` 复制过来的,通常还需要手动追加以上三个变量。
默认镜像地址会拼接为:
```text
${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ECR_REPOSITORY}:latest
```
如果要手动指定完整镜像地址,可以在 `.env` 中设置:
```text
ERYAO_BACKEND_IMAGE=<完整镜像地址>
```
Web 服务端口使用项目环境变量:
```text
ERYAO_WEB__PORT=5775
```
默认只绑定本机回环地址:
```text
ERYAO_DEPLOY_BIND_HOST=127.0.0.1
```
如果生产机器没有 Nginx、ALB 或其他反向代理,需要直接对外暴露端口,可改为:
```text
ERYAO_DEPLOY_BIND_HOST=0.0.0.0
```
## 登录 ECR
进入部署目录,并把 `.env` 加载到当前 shell
```bash
cd deploy
set -a
. ./.env
set +a
```
在生产机器上配置好 AWS 凭据后执行:
```bash
aws ecr get-login-password --region "$AWS_REGION" \
| docker login --username AWS --password-stdin \
"${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com"
```
## 启动服务
启动 Web、Redis 和 worker(本文各段命令均假设从项目根目录开始执行;如果当前已在 `deploy/` 目录中,请跳过 `cd deploy`)
```bash
cd deploy
docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers pull
docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers up -d
```
只启动 Web 和 Redis
```bash
cd deploy
docker compose --env-file ./.env -f docker-compose.prod.yml up -d
```
## 健康检查
如果 `ERYAO_WEB__PORT=5775`
```bash
curl http://127.0.0.1:5775/health
```
期望返回:
```json
{"status":"ok"}
```
## 查看状态和日志
```bash
cd deploy
docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers ps
docker logs -f eryao-prod-backend
docker logs -f eryao-prod-worker-agent
docker logs -f eryao-prod-worker-general
docker logs -f eryao-prod-redis
```
## 更新版本
CI 推送新镜像到 ECR 后,在生产机器执行:
```bash
cd deploy
docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers pull
docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers up -d
```
## 停止服务
```bash
cd deploy
docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers down
```
如需连 Redis 数据卷一起删除:
```bash
cd deploy
docker compose --env-file ./.env -f docker-compose.prod.yml --profile workers down -v
```
谨慎使用 `down -v`,它会删除 Redis 持久化数据。
+79
View File
@@ -0,0 +1,79 @@
# Production Compose project: runs the backend image pulled from ECR (web plus
# two optional Taskiq workers behind the `workers` profile) and a Redis
# instance with a persistent data volume. Nothing is built locally.
name: eryao-prod

# Settings shared by the web service and both workers.
x-backend-common: &backend-common
  # ERYAO_BACKEND_IMAGE overrides the full image reference; otherwise the
  # reference is assembled from the three required ECR variables.
  image: ${ERYAO_BACKEND_IMAGE:-${AWS_ACCOUNT_ID:?AWS_ACCOUNT_ID is required}.dkr.ecr.${AWS_REGION:?AWS_REGION is required}.amazonaws.com/${ECR_REPOSITORY:?ECR_REPOSITORY is required}:latest}
  env_file:
    - path: ./.env
      required: true
  depends_on:
    redis:
      condition: service_healthy
  restart: unless-stopped

services:
  backend:
    <<: *backend-common
    container_name: eryao-prod-backend
    environment:
      ERYAO_RUNTIME__ENVIRONMENT: prod
      ERYAO_RUNTIME__SERVICE_NAME: web
      ERYAO_REDIS__HOST: redis
      ERYAO_REDIS__PORT: "6379"
    ports:
      # Bound to loopback unless ERYAO_DEPLOY_BIND_HOST says otherwise.
      - "${ERYAO_DEPLOY_BIND_HOST:-127.0.0.1}:${ERYAO_WEB__PORT:-5775}:${ERYAO_WEB__PORT:-5775}"

  worker-agent:
    <<: *backend-common
    container_name: eryao-prod-worker-agent
    profiles: ["workers"]
    environment:
      ERYAO_RUNTIME__ENVIRONMENT: prod
      ERYAO_RUNTIME__SERVICE_NAME: worker-agent
      ERYAO_REDIS__HOST: redis
      ERYAO_REDIS__PORT: "6379"
    # The single-`$` variable below is interpolated by Compose when the file
    # is parsed, not by the shell inside the container.
    command:
      - sh
      - -c
      - exec taskiq worker core.taskiq.app:worker_agent_broker core.agentscope.runtime.tasks --workers ${ERYAO_WORKER__GROUPS__AGENT__CONCURRENCY:-2}

  worker-general:
    <<: *backend-common
    container_name: eryao-prod-worker-general
    profiles: ["workers"]
    environment:
      ERYAO_RUNTIME__ENVIRONMENT: prod
      ERYAO_RUNTIME__SERVICE_NAME: worker-general
      ERYAO_REDIS__HOST: redis
      ERYAO_REDIS__PORT: "6379"
    # The single-`$` variable below is interpolated by Compose when the file
    # is parsed, not by the shell inside the container.
    command:
      - sh
      - -c
      - exec taskiq worker core.taskiq.app:worker_general_broker core.agentscope.runtime.tasks v1.feedback.tasks --workers ${ERYAO_WORKER__GROUPS__GENERAL__CONCURRENCY:-1}

  redis:
    image: redis:7.4.2-alpine
    container_name: eryao-prod-redis
    # Only the Redis password is passed in (interpolated from the Compose
    # environment); the full application .env is deliberately not loaded into
    # this container.
    environment:
      REDIS_PASSWORD: ${ERYAO_REDIS__PASSWORD:-}
    # `$$` defers expansion to the container shell. `exec` makes redis-server
    # replace the wrapper shell so it receives stop signals directly and can
    # shut down cleanly, matching how the backend and workers are started.
    command:
      - sh
      - -c
      - |
        if [ -n "$$REDIS_PASSWORD" ]; then
          exec redis-server --appendonly yes --requirepass "$$REDIS_PASSWORD"
        fi
        exec redis-server --appendonly yes
    volumes:
      - redis_data:/data
    healthcheck:
      # Healthy only when the reply is actually PONG, rather than trusting the
      # redis-cli exit status alone.
      test:
        - CMD
        - sh
        - -c
        - |
          if [ -n "$$REDIS_PASSWORD" ]; then
            redis-cli -a "$$REDIS_PASSWORD" ping | grep -q PONG
          else
            redis-cli ping | grep -q PONG
          fi
      interval: 5s
      timeout: 3s
      retries: 5
    restart: unless-stopped

volumes:
  redis_data: