Deploy to Production #278
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # .github/workflows/deploy.yml | |
| # | |
| # Production Deployment Pipeline | |
| # | |
| # Design principles: | |
| # 1. Triggered ONLY after CodeQL deep scan completes successfully — no polling, no race. | |
| # Uses workflow_run event: deploy is event-driven, not concurrent with security scan. | |
| # 2. Runs ALL validation from scratch — no trust built on PR results alone | |
| # 3. Trivy scan runs BEFORE Docker push — vulnerable images never reach the registry | |
| # 4. target: production + build-args mirror pr.yml exactly (bit-for-bit parity) | |
| # 5. Image digest verified against PR simulation artifact when available | |
| # 6. Blue-Green deploy with automatic rollback on health or smoke test failure | |
| # 7. timeout-minutes on every job — hung processes never block CI indefinitely | |
| # 8. npm ci retried up to 3x — registry flakiness never kills a valid deploy | |
| # | |
| # Pipeline order: | |
| # codeql-gate | |
| # ├─► validate ─┐ | |
| # └─► test-api ├─► build-scan-push ─► vps-readiness-check ─► deploy | |
| # ┘ │ | |
| # api-health-gate ◄────────┘ | |
| # │ | |
| # sync-infra ─► sync-monitoring ─► health-and-smoke | |
| # │ | |
| # rollback ◄──────────────────────────────┘ (on failure) | |
# Display name of this workflow in the Actions UI.
name: Deploy to Production

# Token scope is read-only by default; any job that needs more widens it
# in its own `permissions:` block.
permissions:
  contents: read

# All deploy runs share one concurrency group, and a deployment that is
# already in progress is never cancelled — it finishes or fails on its own.
concurrency:
  cancel-in-progress: false
  group: production-deploy

on:
  # Manual entry point, kept for emergency / hotfix deploys.
  # The codeql-gate job applies its conclusion check only to workflow_run
  # events, so a manual run is not gated on the CodeQL result.
  workflow_dispatch:

  # Normal entry point: fires when the CodeQL deep-scan workflow completes
  # on master. Being event-driven (instead of push + polling) means:
  #   - the deploy cannot race the scan, it starts only after the scan ends
  #   - no API polling loops or timing-dependent checks are needed
  #   - a scan that did not succeed is rejected by the codeql-gate job
  workflow_run:
    branches:
      - master
    types:
      - completed
    workflows:
      - "CodeQL — Deep Scan (post-merge)"
| jobs: | |
| # --------------------------------------------------------------------------- | |
| # JOB: codeql-gate | |
| # | |
| # First job in every deploy run. Two responsibilities: | |
| # | |
| # 1. SECURITY GATE (workflow_run only): | |
| # Reads github.event.workflow_run.conclusion and fails hard if CodeQL | |
| # did not pass. This makes the event-driven guarantee explicit and | |
| # visible in the pipeline UI. | |
| # | |
| # 2. SHA RESOLUTION: | |
| # On workflow_run, github.sha = HEAD of default branch at event time, | |
| # NOT the commit that triggered CodeQL. We must deploy exactly the SHA | |
| # that was scanned. Exports deploy_sha = github.event.workflow_run.head_sha | |
| # so all downstream jobs checkout and tag the correct commit. | |
| # On workflow_dispatch, deploy_sha = github.sha (HEAD of triggered branch). | |
| # | |
| # All subsequent jobs that do git checkout use ref: needs.codeql-gate.outputs.deploy_sha. | |
| # --------------------------------------------------------------------------- | |
codeql-gate:
  # First job of every run.
  #   1. Resolves the commit to deploy and exports it as `deploy_sha`:
  #        workflow_run      -> github.event.workflow_run.head_sha (the scanned commit)
  #        workflow_dispatch -> github.sha
  #   2. On workflow_run only: fails unless the triggering run was on master
  #      and concluded with "success".
  #
  # Every `${{ }}` value is handed to the shell through `env:` rather than
  # pasted into the script text, so a value such as head_branch can never be
  # interpreted as shell syntax.
  name: CodeQL Security Gate
  runs-on: ubuntu-latest
  timeout-minutes: 5
  outputs:
    deploy_sha: ${{ steps.sha.outputs.deploy_sha }}
  steps:
    - name: Resolve deploy SHA
      id: sha
      env:
        EVENT_NAME: ${{ github.event_name }}
        SCANNED_SHA: ${{ github.event.workflow_run.head_sha }}
        DISPATCH_SHA: ${{ github.sha }}
      run: |
        if [ "$EVENT_NAME" = "workflow_run" ]; then
          DEPLOY_SHA="$SCANNED_SHA"
        else
          DEPLOY_SHA="$DISPATCH_SHA"
        fi
        # Downstream jobs check out and tag this value; an empty ref would
        # silently fall back to a default checkout, so refuse to continue.
        if [ -z "$DEPLOY_SHA" ]; then
          echo "::error::Could not resolve a commit SHA to deploy (event=$EVENT_NAME)."
          exit 1
        fi
        echo "deploy_sha=$DEPLOY_SHA" >> "$GITHUB_OUTPUT"

    - name: Verify CodeQL deep scan passed
      if: github.event_name == 'workflow_run'
      env:
        CONCLUSION: ${{ github.event.workflow_run.conclusion }}
        BRANCH: ${{ github.event.workflow_run.head_branch }}
        SHA: ${{ github.event.workflow_run.head_sha }}
        REPOSITORY: ${{ github.repository }}
      run: |
        echo "CodeQL deep scan conclusion : $CONCLUSION"
        echo "Scanned commit SHA : $SHA"
        echo "Head branch : $BRANCH"
        # Branch guard: only master commits deploy to production.
        # The workflow_run trigger already filters branches: [master]; this
        # explicit check keeps the policy visible in the job log and fails
        # hard if that filter is ever widened by accident.
        if [ "$BRANCH" != "master" ]; then
          echo "::error::Deploy blocked — head_branch=$BRANCH (only master is allowed to deploy to production)."
          exit 1
        fi
        if [ "$CONCLUSION" != "success" ]; then
          echo "::error::CodeQL deep scan did not pass (conclusion=$CONCLUSION)."
          echo " Deployment is blocked. Review findings before retrying:"
          echo " https://github.com/${REPOSITORY}/security/code-scanning"
          exit 1
        fi
        echo "✓ CodeQL gate passed — safe to deploy SHA $SHA (branch=$BRANCH)"
| # --------------------------------------------------------------------------- | |
| # JOB: validate | |
| # | |
| # Fast pre-flight: TypeScript check + env-contract guard (no dependency-audit step is defined in this job). | |
| # Runs in parallel with test-api to maximise pipeline speed. | |
| # --------------------------------------------------------------------------- | |
validate:
  # Pre-flight checks on the commit selected by codeql-gate:
  # install dependencies, run the TypeScript check, and enforce that
  # process.env is only read inside src/config/env.ts.
  name: Validate (typecheck + audit)
  runs-on: ubuntu-latest
  needs: [codeql-gate]
  timeout-minutes: 10
  steps:
    - name: Confirm deployment trigger
      env:
        # Log the SHA that is actually checked out below. On workflow_run,
        # github.sha is the default-branch HEAD at event time and may differ
        # from the commit that was scanned and is being deployed.
        DEPLOY_SHA: ${{ needs.codeql-gate.outputs.deploy_sha }}
        EVENT_NAME: ${{ github.event_name }}
        REF: ${{ github.ref }}
      run: |
        echo "========================================="
        echo "Deployment triggered on master"
        echo " Commit SHA : ${DEPLOY_SHA}"
        echo " Event : ${EVENT_NAME}"
        echo " Ref : ${REF}"
        echo "========================================="

    - name: Checkout
      uses: actions/checkout@v5
      with:
        ref: ${{ needs.codeql-gate.outputs.deploy_sha }}

    - name: Setup Node.js 24
      uses: actions/setup-node@v5
      with:
        node-version: '24'
        cache: npm
        cache-dependency-path: package-lock.json

    - name: Install dependencies (with retry)
      run: |
        echo "::group::npm ci"
        # Up to three attempts, 15s apart; the third failure fails the step.
        for attempt in 1 2 3; do
          npm ci && break
          [ "$attempt" -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; }
          echo "Attempt $attempt failed — retrying in 15s..."
          sleep 15
        done
        echo "::endgroup::"

    - name: TypeScript check
      run: npm run typecheck

    - name: Env contract guard (no direct process.env outside env.ts)
      run: |
        # Any "process.env" hit in a .ts file under src/ other than
        # src/config/env.ts is printed and fails the step.
        if grep -r --include="*.ts" "process\.env" src/ \
          | grep -v "src/config/env\.ts"; then
          echo "❌ Direct process.env access detected outside env.ts"
          echo " Use: import { env } from './config/env.js' instead"
          exit 1
        fi
        echo "✅ Env contract clean — no direct process.env access outside env.ts"
| # --------------------------------------------------------------------------- | |
| # JOB: test-api | |
| # | |
| # Full backend test suite — unit tests then integration tests. | |
| # Runs in parallel with validate. | |
| # --------------------------------------------------------------------------- | |
test-api:
  # Runs `npm test` against the commit selected by codeql-gate, with the
  # *_TEST Supabase secrets exposed to every step as environment variables.
  name: API Tests (unit + integration)
  needs: [codeql-gate]
  runs-on: ubuntu-latest
  timeout-minutes: 15
  env:
    SUPABASE_URL: ${{ secrets.SUPABASE_URL_TEST }}
    SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY_TEST }}
    SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY_TEST }}
  steps:
    - name: Checkout
      uses: actions/checkout@v5
      with:
        ref: ${{ needs.codeql-gate.outputs.deploy_sha }}

    - name: Setup Node.js 24
      uses: actions/setup-node@v5
      with:
        cache: npm
        cache-dependency-path: package-lock.json
        node-version: '24'

    - name: Install dependencies (with retry)
      run: |
        echo "::group::npm ci"
        # At most three installs, pausing 15s between them; give up after
        # the third failure.
        attempt=1
        while true; do
          if npm ci; then
            break
          fi
          if [ "$attempt" -eq 3 ]; then
            echo "::error::npm ci failed after 3 attempts"
            exit 1
          fi
          echo "Attempt $attempt failed — retrying in 15s..."
          sleep 15
          attempt=$((attempt + 1))
        done
        echo "::endgroup::"

    - name: Run all tests
      run: npm test
| # --------------------------------------------------------------------------- | |
| # JOB: build-scan-push | |
| # | |
| # Three-phase security gate — identical build config to pr.yml: | |
| # Phase 1 — Build locally (target: production, same build-args, same cache) | |
| # Phase 2 — Trivy scan: pinned aquasec/trivy:0.49.1 Docker image, exit-code 1 | |
| # on HIGH/CRITICAL (blocks push). NOT trivy-action — supply-chain safe. | |
| # DB pre-pulled, scan runs --network none (air-gapped). | |
| # Phase 3 — Push exact scanned image to GHCR (no rebuild) | |
| # | |
| # Image digest verification: | |
| # After building, the digest is compared against the digest stored by | |
| # pr.yml's production-simulation job. A match confirms bit-for-bit parity | |
| # between what was validated in PR and what is being deployed. | |
| # Comparison is best-effort (continue-on-error) because the merge commit | |
| # SHA may differ from the PR head SHA on squash-merges. | |
| # --------------------------------------------------------------------------- | |
build-scan-push:
  # Builds the production image locally, scans it with a digest-pinned
  # Trivy image, and only then tags and pushes that same local image to GHCR.
  # Also emits SARIF, a CycloneDX SBOM and a provenance file.
  #
  # Outputs:
  #   sha_short  - first 7 characters of the deployed commit (image tag)
  #   digest     - `docker inspect .Id` of the built image
  #   deploy_sha - full SHA of the deployed commit
  #
  # All commit references use the SHA resolved by codeql-gate. github.sha is
  # deliberately not used: on workflow_run it is the default-branch HEAD at
  # event time and may not be the commit that was built.
  name: Build, Scan & Push Docker Image
  runs-on: ubuntu-latest
  needs: [codeql-gate, validate, test-api]
  timeout-minutes: 25
  permissions:
    contents: read
    packages: write
    security-events: write
    # Read access to workflow runs and artifacts: used by `gh run list` and
    # `gh run download` in the digest-parity step, and documented by
    # upload-sarif as required for private repositories.
    actions: read
  outputs:
    sha_short: ${{ steps.meta.outputs.sha_short }}
    digest: ${{ steps.digest.outputs.digest }}
    deploy_sha: ${{ steps.meta.outputs.deploy_sha }}
  steps:
    - name: Checkout
      uses: actions/checkout@v5
      with:
        ref: ${{ needs.codeql-gate.outputs.deploy_sha }}

    - name: Extract commit SHA
      id: meta
      env:
        DEPLOY_SHA: ${{ needs.codeql-gate.outputs.deploy_sha }}
      run: |
        echo "sha_short=${DEPLOY_SHA::7}" >> "$GITHUB_OUTPUT"
        echo "deploy_sha=$DEPLOY_SHA" >> "$GITHUB_OUTPUT"

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Pull base images (force fresh manifest, prevent stale GHA cache)
      run: |
        docker pull node:24.2.0-bookworm-slim
        docker pull gcr.io/distroless/nodejs24-debian12:nonroot

    # Phase 1: build into the local Docker daemon (load: true, push: false)
    # so the image can be scanned before anything reaches the registry.
    # CACHE_BUSTER changes whenever a package-lock.json changes.
    # The GHA layer cache is scoped to "production".
    - name: Build Docker image (pre-scan, no push)
      uses: docker/build-push-action@v6
      with:
        context: .
        file: ./Dockerfile
        target: production
        build-args: |
          NODE_ENV=production
          CACHE_BUSTER=${{ hashFiles('**/package-lock.json') }}
        push: false
        load: true
        pull: true
        tags: |
          fieldtrack-api:${{ steps.meta.outputs.sha_short }}
        cache-from: type=gha,scope=production
        cache-to: type=gha,mode=max,scope=production

    # Runs node inside the built image and exercises tls.createSecureContext()
    # and a SHA-256 hash, so a broken crypto/TLS runtime fails the job here.
    - name: Verify Node.js runtime (TLS operational check)
      run: |
        IMAGE_NAME="fieldtrack-api:${{ steps.meta.outputs.sha_short }}"
        echo "Testing image: $IMAGE_NAME"
        docker run --rm \
          --entrypoint /nodejs/bin/node \
          "$IMAGE_NAME" \
          -e "
            const crypto = require('crypto');
            const tls = require('tls');
            const ctx = tls.createSecureContext();
            if (!ctx) { process.stderr.write('FAIL: TLS context failed\n'); process.exit(1); }
            const h = crypto.createHash('sha256').update('smoke').digest('hex');
            if (!h) { process.stderr.write('FAIL: hash failed\n'); process.exit(1); }
            process.stdout.write('node=' + process.versions.node + ' openssl=' + process.versions.openssl + ' tls=ok\n');
          "

    # Records the local image ID; it is re-checked just before the push.
    - name: Capture image digest
      id: digest
      env:
        DEPLOY_SHA: ${{ steps.meta.outputs.deploy_sha }}
      run: |
        IMAGE_NAME="fieldtrack-api:${{ steps.meta.outputs.sha_short }}"
        DIGEST=$(docker inspect "$IMAGE_NAME" --format='{{.Id}}')
        echo "digest=$DIGEST" >> "$GITHUB_OUTPUT"
        echo "=== Build traceability ==="
        echo " Commit SHA : ${DEPLOY_SHA}"
        echo " Image tag : $IMAGE_NAME"
        echo " Image digest : $DIGEST"

    # Best-effort comparison with the digest artifact stored by pr.yml.
    # continue-on-error: a mismatch or lookup failure never blocks the deploy.
    - name: Verify image digest parity with PR simulation
      continue-on-error: true
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        REPO: ${{ github.repository }}
        DEPLOY_SHA: ${{ steps.meta.outputs.deploy_sha }}
        DEPLOY_DIGEST: ${{ steps.digest.outputs.digest }}
      run: |
        # Find the PR associated with the deployed commit, plus that PR's
        # head SHA so the matching pr.yml run can be selected below.
        PR_INFO=$(gh api \
          "/repos/${REPO}/commits/${DEPLOY_SHA}/pulls" \
          --header "X-GitHub-Api-Version: 2022-11-28" \
          --jq '.[0] // empty | "\(.number) \(.head.sha)"' 2>/dev/null || echo "")
        if [ -z "$PR_INFO" ]; then
          echo "No associated PR found for commit ${DEPLOY_SHA} — skipping digest comparison."
          exit 0
        fi
        PR_NUMBER="${PR_INFO%% *}"
        PR_HEAD_SHA="${PR_INFO##* }"
        echo "Associated PR: #${PR_NUMBER}"
        # Most recent successful pr.yml run for THIS PR's head commit.
        # Without the headSha filter the newest successful run of any PR
        # would be picked.
        RUN_ID=$(gh run list \
          --repo "${REPO}" \
          --workflow "pr.yml" \
          --limit 100 \
          --json databaseId,conclusion,headSha \
          --jq "map(select(.conclusion == \"success\" and .headSha == \"${PR_HEAD_SHA}\")) | .[0].databaseId // empty" \
          2>/dev/null || echo "")
        if [ -z "$RUN_ID" ]; then
          echo "No successful PR validation run found — skipping digest comparison."
          exit 0
        fi
        # Download the image-digest artifact from that run
        gh run download "$RUN_ID" \
          --repo "${REPO}" \
          --name "image-digest-pr-${PR_NUMBER}" \
          --dir /tmp/pr-digest \
          2>/dev/null || true
        if [ ! -f /tmp/pr-digest/image-digest.txt ]; then
          echo "PR image-digest artifact not found — skipping comparison."
          exit 0
        fi
        PR_DIGEST=$(cat /tmp/pr-digest/image-digest.txt)
        echo "PR simulation digest: $PR_DIGEST"
        echo "Deploy image digest: $DEPLOY_DIGEST"
        if [ "$PR_DIGEST" = "$DEPLOY_DIGEST" ]; then
          echo "✓ Digest match — bit-for-bit parity confirmed between PR and deploy."
        else
          echo "⚠ Digest mismatch — builds diverged between PR and deploy."
          echo " Expected on squash-merges where the commit SHA changes."
          echo " Ensure no source changes occurred between PR approval and deploy trigger."
        fi

    # Phase 2: Trivy scan using an image pinned by immutable digest
    # (aquasec/trivy:0.49.1 -> sha256:91494b87...). The vulnerability DB is
    # downloaded first (needs network); every scan then runs --network none.
    - name: Get date for Trivy DB cache key
      id: trivy-date
      run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"

    - name: Cache Trivy DB (daily refresh)
      uses: actions/cache@v4
      with:
        path: /tmp/trivy-cache
        key: trivy-db-${{ runner.os }}-${{ steps.trivy-date.outputs.date }}
        restore-keys: |
          trivy-db-${{ runner.os }}-

    - name: Pull Trivy vulnerability database
      run: |
        docker run --rm \
          -v /tmp/trivy-cache:/root/.cache \
          aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc \
          image --download-db-only

    # Blocking gate: exit code 1 on fixable HIGH/CRITICAL findings.
    # The scan is attempted up to three times before the step fails.
    - name: Scan image with Trivy (HIGH/CRITICAL, ignore-unfixed)
      env:
        IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
      run: |
        SCAN_PASSED=false
        for i in 1 2 3; do
          if docker run --rm \
            --network none \
            -v /var/run/docker.sock:/var/run/docker.sock \
            -v /tmp/trivy-cache:/root/.cache \
            -v "$(pwd)/.trivyignore:/tmp/.trivyignore:ro" \
            aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
            --skip-db-update \
            --ignore-unfixed \
            --severity HIGH,CRITICAL \
            --exit-code 1 \
            --ignorefile /tmp/.trivyignore \
            "$IMAGE_NAME"; then
            SCAN_PASSED=true
            break
          fi
          echo "Trivy attempt $i failed..."
          [ "$i" -lt 3 ] && sleep 5
        done
        if [ "$SCAN_PASSED" != "true" ]; then
          echo "::error::Trivy scan failed after 3 attempts — HIGH/CRITICAL vulnerabilities found or scan error."
          exit 1
        fi
        echo "✓ Trivy scan passed (HIGH/CRITICAL, ignore-unfixed)"

    # Non-blocking report of CRITICAL findings that have no fixed version.
    - name: Scan for unfixed CRITICAL vulnerabilities (informational)
      continue-on-error: true
      env:
        IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
      run: |
        UNFIXED_COUNT=$(docker run --rm \
          --network none \
          -v /var/run/docker.sock:/var/run/docker.sock \
          -v /tmp/trivy-cache:/root/.cache \
          aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
          --skip-db-update \
          --severity CRITICAL \
          --format json \
          "$IMAGE_NAME" | jq '[.Results[]?.Misconfigurations[]? // .Results[]?.Vulnerabilities[]? | select(.FixedVersion == null or .FixedVersion == "")] | length')
        # An empty count would make the numeric test below error out.
        UNFIXED_COUNT="${UNFIXED_COUNT:-0}"
        if [ "$UNFIXED_COUNT" -gt 0 ]; then
          echo "⚠ WARNING: $UNFIXED_COUNT unfixed CRITICAL vulnerabilities found"
          echo " (No patches available upstream — waiting for vendor fix)"
          docker run --rm \
            --network none \
            -v /var/run/docker.sock:/var/run/docker.sock \
            -v /tmp/trivy-cache:/root/.cache \
            aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
            --skip-db-update \
            --severity CRITICAL \
            "$IMAGE_NAME" >> /tmp/unfixed-critical.log || true
        else
          echo "✓ No unfixed CRITICAL vulnerabilities"
        fi

    - name: Generate Trivy scan results (SARIF for GitHub Security)
      env:
        IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
      run: |
        docker run --rm \
          --network none \
          -v /var/run/docker.sock:/var/run/docker.sock \
          -v /tmp/trivy-cache:/root/.cache \
          -v "$(pwd):/workspace" \
          aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
          --skip-db-update \
          --format sarif \
          --output /workspace/trivy-results.sarif \
          "$IMAGE_NAME"
        echo "✓ SARIF results written to trivy-results.sarif"

    - name: Upload Trivy scan results to GitHub Security
      uses: github/codeql-action/upload-sarif@v3
      with:
        sarif_file: trivy-results.sarif
        category: 'trivy-image-scan'

    # Phase 3: the scan passed. Re-check the image ID, then docker tag +
    # docker push the local image (no rebuild), so the pushed image is the
    # one that was scanned.
    - name: Verify image digest unchanged before push
      env:
        IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
        IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
      run: |
        # Same `docker inspect .Id` value that the digest step captured.
        CURRENT=$(docker inspect "$IMAGE_NAME" --format='{{.Id}}')
        echo "Expected digest : $IMAGE_DIGEST"
        echo "Current digest : $CURRENT"
        if [ "$CURRENT" != "$IMAGE_DIGEST" ]; then
          echo "ERROR: image digest changed between scan and push — aborting."
          exit 1
        fi
        echo "✓ Digest verified — pushing exactly what was scanned."

    - name: Login to GitHub Container Registry
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    - name: Push verified image to registry
      run: |
        # The owner is lower-cased before being used in the image reference.
        OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
        docker tag \
          fieldtrack-api:${{ steps.meta.outputs.sha_short }} \
          ghcr.io/${OWNER}/api:${{ steps.meta.outputs.sha_short }}
        docker push ghcr.io/${OWNER}/api:${{ steps.meta.outputs.sha_short }}
        echo "✓ Pushed ghcr.io/${OWNER}/api:${{ steps.meta.outputs.sha_short }}"

    # SBOM is produced with the same pinned Trivy image as the scans.
    - name: Generate SBOM (CycloneDX)
      env:
        IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
      run: |
        docker run --rm \
          -v /var/run/docker.sock:/var/run/docker.sock \
          aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
          --format cyclonedx \
          --output /dev/stdout \
          "$IMAGE_NAME" > sbom.json

    - name: Upload SBOM artifact
      uses: actions/upload-artifact@v4
      with:
        name: sbom-${{ steps.meta.outputs.sha_short }}
        path: sbom.json
        retention-days: 90

    - name: Save build provenance
      env:
        IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
        IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
        DEPLOY_SHA: ${{ steps.meta.outputs.deploy_sha }}
      run: |
        # commit= records the commit that was actually built.
        echo "commit=${DEPLOY_SHA}" > provenance.txt
        echo "ref=${{ github.ref }}" >> provenance.txt
        echo "image=${IMAGE_NAME}" >> provenance.txt
        echo "digest=${IMAGE_DIGEST}" >> provenance.txt
        echo "workflow=${{ github.workflow }}" >> provenance.txt
        echo "run_id=${{ github.run_id }}" >> provenance.txt

    - name: Upload provenance artifact
      uses: actions/upload-artifact@v4
      with:
        name: provenance-${{ steps.meta.outputs.sha_short }}
        path: provenance.txt
        retention-days: 90

    # Runs even when an earlier step failed; the SBOM count falls back to
    # 'n/a' if sbom.json is missing or unreadable.
    - name: Build & scan summary
      if: always()
      env:
        IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
        DEPLOY_SHA: ${{ steps.meta.outputs.deploy_sha }}
      run: |
        SBOM_COUNT=$(python3 -c "import json; d=json.load(open('sbom.json')); print(len(d.get('components', [])))" 2>/dev/null || echo 'n/a')
        {
          echo "## Build · Scan · Push"
          echo "| Field | Value |"
          echo "|---|---|"
          echo "| Commit SHA | \`${DEPLOY_SHA}\` |"
          echo "| Image tag | \`fieldtrack-api:${{ steps.meta.outputs.sha_short }}\` |"
          echo "| Image digest | \`${IMAGE_DIGEST}\` |"
          echo "| SBOM components | ${SBOM_COUNT} |"
          echo "| Trivy gate | HIGH,CRITICAL / exit-code 1 / ignore-unfixed |"
          echo "| Registry | ghcr.io/${{ github.repository_owner }}/api |"
        } >> "$GITHUB_STEP_SUMMARY"
| # --------------------------------------------------------------------------- | |
| # JOB: vps-readiness-check | |
| # | |
| # Validates the VPS is in a deployable state BEFORE running the deploy. | |
| # Runs after build-scan-push succeeds and before the deploy job. | |
| # deploy needs both build-scan-push and this job, so both must succeed before it proceeds. | |
| # | |
| # Delegates to scripts/vps-readiness-check.sh which checks: | |
| # - Docker daemon running | |
| # - api_network exists (auto-created if missing) | |
| # - Ports 80/443 free from non-nginx processes | |
| # - No API containers with host port bindings | |
| # - Required .env file present | |
| # - Runtime directories present (auto-created if missing) | |
| # - Sufficient disk space (auto-prunes if borderline) | |
| # --------------------------------------------------------------------------- | |
vps-readiness-check:
  # Connects to the VPS over SSH, refreshes scripts/vps-readiness-check.sh
  # from origin/master (best effort) and executes it. A non-zero exit from
  # the script fails this job.
  name: VPS Readiness Gate
  needs: [build-scan-push]
  runs-on: ubuntu-latest
  timeout-minutes: 10
  steps:
    - name: Run VPS readiness check via SSH
      uses: appleboy/ssh-action@v1.0.3
      with:
        host: ${{ secrets.DO_HOST }}
        username: ${{ secrets.DO_USER }}
        key: ${{ secrets.DO_SSH_KEY }}
        script: |
          set -euo pipefail
          export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
          if [ ! -d "$DEPLOY_ROOT" ]; then
            echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT — run vps-setup.sh first"
            exit 1
          fi
          cd "$DEPLOY_ROOT"
          # Refresh only the readiness script; the working tree is otherwise
          # left untouched. A failed checkout is ignored and the copy already
          # on disk is used.
          git fetch origin master --depth=1
          git checkout origin/master -- scripts/vps-readiness-check.sh 2>/dev/null || true
          chmod +x scripts/vps-readiness-check.sh
          ./scripts/vps-readiness-check.sh
| # --------------------------------------------------------------------------- | |
| # JOB: deploy | |
| # | |
| # Blue-Green deployment to VPS via SSH. | |
| # The deploy-bluegreen.sh script manages slot switching and container health. | |
| # | |
| # DEPENDENCY GATES (both must pass): | |
| # - build-scan-push: image built, scanned and pushed; supplies deploy_sha and sha_short | |
| # - vps-readiness-check: ensures VPS can accept the deployment | |
| # --------------------------------------------------------------------------- | |
deploy:
  # Blue-Green deployment to the VPS over SSH.
  #   1. Fail fast if the API_BASE_URL or CORS_ORIGIN secret is empty.
  #   2. Write deployment metadata to the step summary.
  #   3. Pin the VPS checkout to the built SHA and run validate-env.sh.
  #   4. Pin again and run deploy-bluegreen.sh with the short SHA.
  #   5. Always log the active slot and its /health status.
  #
  # The commit logged everywhere is build-scan-push's deploy_sha, not
  # github.sha (on workflow_run that is the default-branch HEAD at event time).
  name: Deploy (Blue-Green SSH)
  runs-on: ubuntu-latest
  needs: [build-scan-push, vps-readiness-check]
  timeout-minutes: 20
  steps:
    - name: Validate required deployment secrets
      env:
        API_BASE_URL: ${{ secrets.API_BASE_URL }}
        CORS_ORIGIN: ${{ secrets.CORS_ORIGIN }}
      run: |
        if [ -z "${API_BASE_URL:-}" ]; then
          echo "::error::API_BASE_URL secret is not set. Deployment aborted."
          exit 1
        fi
        echo "✓ API_BASE_URL is set"
        if [ -z "${CORS_ORIGIN:-}" ]; then
          echo "::error::CORS_ORIGIN secret is not set. Deployment aborted."
          exit 1
        fi
        echo "✓ CORS_ORIGIN is set"

    - name: Log deployment metadata and trigger info
      env:
        DEPLOY_SHA: ${{ needs.build-scan-push.outputs.deploy_sha }}
        EVENT_NAME: ${{ github.event_name }}
        ACTOR: ${{ github.actor }}
        BRANCH: ${{ github.ref_name }}
        REPOSITORY: ${{ github.repository }}
        RUN_ID: ${{ github.run_id }}
        # Commit messages are attacker-controllable text, so the message is
        # passed as data via env and never pasted into the script.
        # workflow_run carries it under workflow_run.head_commit; the
        # push-style head_commit is kept as a fallback. Empty on manual runs.
        COMMIT_MESSAGE: ${{ github.event.workflow_run.head_commit.message || github.event.head_commit.message }}
      run: |
        # Subject line only: a multi-line message would break the table row.
        COMMIT_SUBJECT="${COMMIT_MESSAGE%%$'\n'*}"
        {
          echo "## Deployment Initiated"
          echo "| Field | Value |"
          echo "|---|---|"
          echo "| Commit SHA | \`${DEPLOY_SHA}\` |"
          echo "| Trigger event | ${EVENT_NAME} |"
          echo "| Triggered by | ${ACTOR} |"
          echo "| Branch | ${BRANCH} |"
          echo "| Workflow run | [${RUN_ID}](https://github.com/${REPOSITORY}/actions/runs/${RUN_ID}) |"
          echo "| Commit message | \`${COMMIT_SUBJECT}\` |"
        } >> "$GITHUB_STEP_SUMMARY"
        echo "📋 Deployment initiated — SHA=${DEPLOY_SHA} EVENT=${EVENT_NAME} ACTOR=${ACTOR} RUN=${RUN_ID}"

    - name: Validate environment contract before deploy
      uses: appleboy/ssh-action@v1.0.3
      with:
        host: ${{ secrets.DO_HOST }}
        username: ${{ secrets.DO_USER }}
        key: ${{ secrets.DO_SSH_KEY }}
        script: |
          set -euo pipefail
          export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
          echo "USER=$(whoami)"
          echo "HOME=$HOME"
          echo "PWD=$(pwd)"
          # Debug listings are informational; under `set -e` they must not
          # abort the step before the DEPLOY_ROOT guard can report the problem.
          ls -la "$HOME" || true
          ls -la "$HOME/api" || true
          [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
          cd "$DEPLOY_ROOT"
          # Pin the checkout to the exact SHA that was built and scanned, so
          # the scripts run below come from that commit.
          git fetch origin
          git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }}
          chmod +x scripts/*.sh
          echo "=== Pre-deploy environment validation ==="
          ./scripts/validate-env.sh --check-monitoring
          echo "✓ Environment contract validated"

    - name: Blue-Green deploy via SSH
      uses: appleboy/ssh-action@v1.0.3
      with:
        host: ${{ secrets.DO_HOST }}
        username: ${{ secrets.DO_USER }}
        key: ${{ secrets.DO_SSH_KEY }}
        script: |
          set -euo pipefail
          T0=$(date +%s)
          export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
          echo "USER=$(whoami)"
          echo "HOME=$HOME"
          echo "PWD=$(pwd)"
          ls -la "$HOME" || true
          ls -la "$HOME/api" || true
          # Guard first, then list: listing a missing directory under
          # `set -e` would exit before the explicit error message.
          [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
          ls -la "$DEPLOY_ROOT"
          cd "$DEPLOY_ROOT"
          # Re-pin to the deployed SHA in case the checkout moved since the
          # previous step.
          git fetch origin
          git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }}
          chmod +x scripts/*.sh
          # Environment already validated in previous step
          ./scripts/deploy-bluegreen.sh "${{ needs.build-scan-push.outputs.sha_short }}"
          echo "✓ Deploy completed in $(($(date +%s) - T0))s"

    - name: Log deployment state (slot + SHA for debugging)
      uses: appleboy/ssh-action@v1.0.3
      if: always()
      with:
        host: ${{ secrets.DO_HOST }}
        username: ${{ secrets.DO_USER }}
        key: ${{ secrets.DO_SSH_KEY }}
        script: |
          ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "unknown")
          ACTIVE_CONTAINER="api-${ACTIVE_SLOT}"
          DEPLOY_STATUS="UNKNOWN"
          # Health check from a throwaway curl container on api_network, so
          # the container is reached by name without any host port binding.
          FT_CURL_IMG="curlimages/curl:8.7.1"
          if docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1; then
            if docker run --rm --network api_network "$FT_CURL_IMG" \
              -sf --max-time 5 "http://$ACTIVE_CONTAINER:3000/health" >/dev/null 2>&1; then
              DEPLOY_STATUS="SUCCESS"
            else
              DEPLOY_STATUS="UNHEALTHY"
            fi
          else
            DEPLOY_STATUS="CONTAINER_MISSING"
          fi
          echo "DEPLOY_STATE=$DEPLOY_STATUS | SLOT=$ACTIVE_SLOT | CONTAINER=$ACTIVE_CONTAINER | SHA=${{ needs.build-scan-push.outputs.deploy_sha }}"
| # --------------------------------------------------------------------------- | |
| # JOB: api-health-gate (Step E+) | |
| # | |
| # Early API health validation — runs AFTER deploy but BEFORE infra sync. | |
| # Ensures the API container is truly healthy before we sync monitoring/nginx. | |
| # If the API is not healthy at this point, STOP before touching infra. | |
| # --------------------------------------------------------------------------- | |
api-health-gate:
  # Runs after deploy. Polls the active slot's /ready endpoint from a curl
  # container on api_network, up to 15 attempts with 2s between them.
  # Fails, after printing the last 30 container log lines, if no attempt
  # returns HTTP 200.
  name: API Health Gate
  runs-on: ubuntu-latest
  needs: [deploy]
  timeout-minutes: 5
  steps:
    - name: Verify API container is healthy before infra sync
      uses: appleboy/ssh-action@v1.0.3
      with:
        host: ${{ secrets.DO_HOST }}
        username: ${{ secrets.DO_USER }}
        key: ${{ secrets.DO_SSH_KEY }}
        script: |
          set -euo pipefail
          export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
          [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
          cd "$DEPLOY_ROOT"
          source scripts/load-env.sh
          # Determine active slot; falls back to "blue" when the slot file
          # cannot be read.
          ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "blue")
          ACTIVE_CONTAINER="api-$ACTIVE_SLOT"
          echo "=== API Health Gate (slot: $ACTIVE_SLOT, container: $ACTIVE_CONTAINER) ==="
          # Guard: container must exist before we try to reach it
          docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1 || {
            echo "❌ Container $ACTIVE_CONTAINER not found"
            exit 1
          }
          # Poll /ready with `docker run` on api_network rather than
          # `docker exec`, so the request goes over the Docker network by
          # container name instead of the container's own loopback.
          FT_CURL_IMG="curlimages/curl:8.7.1"
          for i in $(seq 1 15); do
            # curl -w already prints "000" when no response was received, so
            # a failure must not append a second "000" (the log would show
            # "HTTP 000000"). Default only when nothing was printed at all,
            # e.g. when `docker run` itself failed.
            STATUS=$(docker run --rm --network api_network "$FT_CURL_IMG" \
              -s -o /dev/null -w "%{http_code}" \
              "http://$ACTIVE_CONTAINER:3000/ready" 2>/dev/null || true)
            STATUS="${STATUS:-000}"
            if [ "$STATUS" = "200" ]; then
              echo "✓ API ready (container $ACTIVE_CONTAINER, attempt $i)"
              exit 0
            fi
            echo " Attempt $i: HTTP $STATUS — waiting..."
            sleep 2
          done
          echo "❌ API /ready did not return 200 after 30s — monitoring sync would fail anyway"
          docker logs "$ACTIVE_CONTAINER" --tail 30 2>/dev/null || true
          exit 1
| # --------------------------------------------------------------------------- | |
| # JOB: sync-infra | |
| # | |
| # Syncs Nginx config (with slot-aware port substitution). | |
| # Monitoring restarts are handled exclusively by deploy-bluegreen.sh. | |
| # --------------------------------------------------------------------------- | |
| sync-infra: | |
| name: Sync Infrastructure (nginx) | |
| runs-on: ubuntu-latest | |
| needs: [api-health-gate] | |
| timeout-minutes: 10 | |
| steps: | |
| # Renders infra/nginx/api.conf for the active slot, installs it as the live | |
| # config, reloads nginx, and restores the latest backup if validation fails. | |
| - name: Sync infrastructure configs via SSH | |
| uses: appleboy/ssh-action@v1.0.3 | |
| with: | |
| host: ${{ secrets.DO_HOST }} | |
| username: ${{ secrets.DO_USER }} | |
| key: ${{ secrets.DO_SSH_KEY }} | |
| script: | | |
| set -euo pipefail | |
| T0=$(date +%s) | |
| export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" | |
| echo "USER=$(whoami)" | |
| echo "HOME=$HOME" | |
| echo "PWD=$(pwd)" | |
| [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } | |
| # Diagnostics only: run after the guard and never let a listing failure | |
| # abort the sync under `set -e`. | |
| ls -la "$HOME" || true | |
| ls -la "$DEPLOY_ROOT" || true | |
| cd "$DEPLOY_ROOT" | |
| INFRA_DIR="$DEPLOY_ROOT/infra" | |
| NGINX_LIVE="$DEPLOY_ROOT/infra/nginx/live/api.conf" | |
| NGINX_BACKUP_DIR="$DEPLOY_ROOT/infra/nginx/backup" | |
| ACTIVE_SLOT_FILE="/var/run/api/active-slot" | |
| ACTIVE_SLOT=$(cat "$ACTIVE_SLOT_FILE" 2>/dev/null || echo "blue") | |
| ACTIVE_CONTAINER="api-$ACTIVE_SLOT" | |
| # Copies the most recent backup over the live config, if one exists. | |
| restore_nginx_backup() { | |
| LATEST_BAK=$(ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | head -1 || true) | |
| if [ -n "$LATEST_BAK" ]; then | |
| cp "$LATEST_BAK" "$NGINX_LIVE" | |
| else | |
| echo "⚠ No Nginx backup available to restore" | |
| fi | |
| } | |
| # Load env from .env — exports DEPLOY_ROOT, API_HOSTNAME, and all | |
| # app variables. DEPLOY_ROOT is already exported above; load-env.sh uses it. | |
| source "$DEPLOY_ROOT/scripts/load-env.sh" | |
| echo "✓ API_HOSTNAME: $API_HOSTNAME" | |
| # Ensure live/backup dirs exist | |
| mkdir -p "$(dirname "$NGINX_LIVE")" "$NGINX_BACKUP_DIR" | |
| echo "=== Syncing Nginx (slot: $ACTIVE_SLOT, container: $ACTIVE_CONTAINER) ===" | |
| # Best-effort backup: there is no live config to copy on a first deploy. | |
| cp "$NGINX_LIVE" "$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" 2>/dev/null || true | |
| NGINX_TMP=$(mktemp /tmp/fieldtrack-nginx.XXXXXX.conf) | |
| sed \ | |
| -e "s|__ACTIVE_CONTAINER__|$ACTIVE_CONTAINER|g" \ | |
| -e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \ | |
| "$INFRA_DIR/nginx/api.conf" > "$NGINX_TMP" | |
| cp "$NGINX_TMP" "$NGINX_LIVE" | |
| rm -f "$NGINX_TMP" | |
| if ! docker exec nginx nginx -t 2>&1; then | |
| echo "Nginx test failed — restoring backup..." | |
| restore_nginx_backup | |
| exit 1 | |
| fi | |
| docker exec nginx nginx -s reload | |
| echo "✓ Nginx reloaded." | |
| # ROUTING VALIDATION — Test actual traffic through Nginx | |
| # Phase 1 (source of truth): in-network docker run inside api_network. | |
| # Phase 2 (advisory): HTTPS pinned to 127.0.0.1 via --resolve; --insecure handles | |
| # Cloudflare origin cert. status=000 = host→Docker TCP routing issue, not TLS. | |
| echo "=== Testing Nginx routing (in-network primary, HTTPS advisory) ===" | |
| sleep 2 # Give Nginx a moment to fully apply reload | |
| # curl -w prints "000" itself on failure, so the fallback replaces the | |
| # value instead of appending a second "000" to it. | |
| ROUTE_STATUS=$(docker run --rm --network api_network \ | |
| curlimages/curl:8.7.1 -s -o /dev/null -w "%{http_code}" \ | |
| --max-time 10 http://nginx/health 2>/dev/null) || ROUTE_STATUS="000" | |
| if [ "$ROUTE_STATUS" = "200" ]; then | |
| echo "✓ Nginx routing verified via in-network check (HTTP $ROUTE_STATUS)" | |
| else | |
| echo "❌ Nginx in-network routing broken (HTTP $ROUTE_STATUS expected 200) — restoring backup..." | |
| restore_nginx_backup | |
| docker exec nginx nginx -t 2>&1 && docker exec nginx nginx -s reload || true | |
| exit 1 | |
| fi | |
| # HTTPS advisory check (non-blocking — host→Docker loopback may fail with status=000). | |
| # The URL carries the real hostname so --resolve actually applies and the | |
| # Host header / SNI match the server block, as in the health-check steps. | |
| HTTPS_STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 \ | |
| --resolve "$API_HOSTNAME:443:127.0.0.1" \ | |
| "https://$API_HOSTNAME/health" --insecure 2>/dev/null) || HTTPS_STATUS="000" | |
| if [ "$HTTPS_STATUS" = "200" ]; then | |
| echo "✓ HTTPS advisory check passed (HTTP $HTTPS_STATUS)" | |
| else | |
| echo "⚠ HTTPS advisory status=$HTTPS_STATUS (host→Docker TCP routing; in-network check is authoritative)" | |
| fi | |
| echo "✓ Infra sync completed in $(($(date +%s) - T0))s" | |
| # --------------------------------------------------------------------------- | |
| # JOB: sync-monitoring (Step F) | |
| # | |
| # Idempotent monitoring stack sync — runs after every deploy. | |
| # Delegates to scripts/monitoring-sync.sh which: | |
| # - Self-heals missing .env.monitoring from example | |
| # - Creates api_network if absent | |
| # - Renders alertmanager.rendered.yml | |
| # - Runs docker compose up -d | |
| # - Validates prometheus / alertmanager / grafana health | |
| # Monitoring is REQUIRED — deploy fails if any required container is unhealthy. | |
| # --------------------------------------------------------------------------- | |
| sync-monitoring: | |
| name: Sync Monitoring Stack | |
| runs-on: ubuntu-latest | |
| needs: [sync-infra] | |
| timeout-minutes: 15 | |
| steps: | |
| # Runs scripts/monitoring-sync.sh on the VPS; a non-zero exit fails the job. | |
| - name: Sync and validate monitoring stack via SSH | |
| uses: appleboy/ssh-action@v1.0.3 | |
| with: | |
| host: ${{ secrets.DO_HOST }} | |
| username: ${{ secrets.DO_USER }} | |
| key: ${{ secrets.DO_SSH_KEY }} | |
| script: | | |
| set -euo pipefail | |
| export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" | |
| [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } | |
| cd "$DEPLOY_ROOT" | |
| chmod +x scripts/monitoring-sync.sh | |
| ./scripts/monitoring-sync.sh | |
| # Static table of the containers this job treats as required (not a live status). | |
| - name: Monitoring sync summary | |
| if: always() | |
| run: | | |
| { | |
| echo "## Monitoring Sync" | |
| echo "| Container | Required |" | |
| echo "|---|---|" | |
| echo "| prometheus | ✅ |" | |
| echo "| alertmanager | ✅ |" | |
| echo "| grafana | ✅ |" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| - name: Deployment artifact traceability | |
| if: always() | |
| env: | |
| # Commit messages are untrusted text: pass them through the environment | |
| # instead of interpolating them into the script body (script injection). | |
| # On workflow_run events the commit lives under event.workflow_run. | |
| COMMIT_MESSAGE: ${{ github.event.workflow_run.head_commit.message || github.event.head_commit.message }} | |
| JOB_STATUS: ${{ job.status }} | |
| run: | | |
| # Subject line only; backtick (\140) and pipe (\174) would break the | |
| # Markdown table cell, so they are replaced. | |
| COMMIT_SUBJECT=$(printf '%s' "${COMMIT_MESSAGE%%$'\n'*}" | tr '\140\174' "'/") | |
| { | |
| echo "## Deployment Artifacts" | |
| echo "| Field | Value |" | |
| echo "|---|---|" | |
| echo "| Deployment SHA | \`${{ github.sha }}\` |" | |
| # This job only `needs` sync-infra, so no other job's outputs are | |
| # visible here; the tag is derived from github.sha directly. | |
| echo "| Image Tag | \`fieldtrack-api:${{ github.sha }}\` |" | |
| echo "| Workflow Run | [\#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) |" | |
| echo "| Triggered By | \`${{ github.event_name }}\` |" | |
| echo "| Commit Message | \`$COMMIT_SUBJECT\` |" | |
| } >> "$GITHUB_STEP_SUMMARY" | |
| # Also output to logs for audit trail. This step runs under always(), | |
| # so only claim completion when the job actually succeeded. | |
| if [ "$JOB_STATUS" = "success" ]; then | |
| echo "DEPLOYMENT_COMPLETE: SHA=${{ github.sha }} IMAGE=ghcr.io/${{ github.repository_owner }}/api:${{ github.sha }} RUN=${{ github.run_id }}" | |
| else | |
| echo "DEPLOYMENT_INCOMPLETE: STATUS=$JOB_STATUS SHA=${{ github.sha }} RUN=${{ github.run_id }}" | |
| fi | |
| # --------------------------------------------------------------------------- | |
| # JOB: health-and-smoke | |
| # | |
| # Step 1: Poll /health until it reports "status":"ok" (30 attempts, then a 10-attempt final pass); /ready is gated earlier by api-health-gate. | |
| # Step 2: Run the full smoke test suite (login + core API flows). | |
| # Failure here triggers the rollback job automatically. | |
| # --------------------------------------------------------------------------- | |
| health-and-smoke: | |
| name: Health Checks & Smoke Tests | |
| runs-on: ubuntu-latest | |
| needs: [sync-infra, sync-monitoring] | |
| timeout-minutes: 15 | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v5 | |
| # Polls /health from the VPS: in-network through nginx first, then HTTPS | |
| # pinned to 127.0.0.1. Either path reporting "status":"ok" passes the step. | |
| - name: Wait for /health endpoint (via VPS) | |
| uses: appleboy/ssh-action@v1.0.3 | |
| with: | |
| host: ${{ secrets.DO_HOST }} | |
| username: ${{ secrets.DO_USER }} | |
| key: ${{ secrets.DO_SSH_KEY }} | |
| script: | | |
| set -euo pipefail | |
| export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" | |
| [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } | |
| cd "$DEPLOY_ROOT" | |
| source scripts/load-env.sh | |
| # Private scratch file (not a fixed /tmp path), removed on exit. | |
| RESP_FILE=$(mktemp) | |
| trap 'rm -f "$RESP_FILE"' EXIT | |
| echo "=== Checking /health via VPS (API_HOSTNAME=$API_HOSTNAME) ===" | |
| for i in $(seq 1 30); do | |
| echo "---- Attempt $i ----" | |
| # Phase 1: in-network (source of truth) | |
| INNET_BODY=$(docker run --rm --network api_network \ | |
| curlimages/curl:8.7.1 -s --max-time 5 http://nginx/health 2>/dev/null || echo "") | |
| if echo "$INNET_BODY" | grep -q '"status":"ok"'; then | |
| echo "✓ /health OK via in-network (attempt $i)" | |
| exit 0 | |
| fi | |
| # Phase 2: HTTPS advisory (--insecure for Cloudflare origin cert; status=000 = host→Docker TCP issue) | |
| # Truncate first so a failed request cannot print the previous attempt's body. | |
| : > "$RESP_FILE" | |
| # curl -w prints "000" itself on failure; the fallback replaces the value | |
| # (an inline `|| echo "000"` would produce "000000" and defeat the check below). | |
| STATUS=$(curl -s --max-time 5 \ | |
| --resolve "${API_HOSTNAME}:443:127.0.0.1" \ | |
| -o "$RESP_FILE" \ | |
| -w "%{http_code}" \ | |
| "https://${API_HOSTNAME}/health" \ | |
| --insecure 2>/dev/null) || STATUS="000" | |
| BODY=$(cat "$RESP_FILE" 2>/dev/null || echo "") | |
| echo "HTTP: $STATUS BODY: $BODY" | |
| if [ "$STATUS" = "200" ] && echo "$BODY" | grep -q '"status":"ok"'; then | |
| echo "✓ /health OK via HTTPS (attempt $i)" | |
| exit 0 | |
| fi | |
| if [ "$STATUS" = "000" ]; then | |
| echo "⚠ HTTPS status=000 (host→Docker routing; in-network is authoritative)" | |
| fi | |
| sleep 2 | |
| done | |
| echo "❌ /health failed after 30 attempts" | |
| exit 1 | |
| # Second, shorter /health pass (10 attempts). Both probes still originate on | |
| # the VPS: in-network through nginx, then HTTPS pinned to 127.0.0.1 via --resolve. | |
| - name: Wait for /health endpoint (final public check) | |
| uses: appleboy/ssh-action@v1.0.3 | |
| with: | |
| host: ${{ secrets.DO_HOST }} | |
| username: ${{ secrets.DO_USER }} | |
| key: ${{ secrets.DO_SSH_KEY }} | |
| script: | | |
| set -euo pipefail | |
| export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" | |
| [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } | |
| cd "$DEPLOY_ROOT" | |
| source scripts/load-env.sh | |
| # Private scratch file (not a fixed /tmp path), removed on exit. | |
| RESP_FILE=$(mktemp) | |
| trap 'rm -f "$RESP_FILE"' EXIT | |
| echo "=== Final health check via public endpoint (API_HOSTNAME=$API_HOSTNAME) ===" | |
| for i in $(seq 1 10); do | |
| echo "---- Attempt $i ----" | |
| # Phase 1: in-network (source of truth) | |
| INNET_BODY=$(docker run --rm --network api_network \ | |
| curlimages/curl:8.7.1 -s --max-time 5 http://nginx/health 2>/dev/null || echo "") | |
| if echo "$INNET_BODY" | grep -q '"status":"ok"'; then | |
| echo "✓ /health OK via in-network (attempt $i)" | |
| exit 0 | |
| fi | |
| # Phase 2: HTTPS advisory (--insecure for Cloudflare origin cert; status=000 = host→Docker TCP issue) | |
| # Truncate first so a failed request cannot print the previous attempt's body. | |
| : > "$RESP_FILE" | |
| # curl -w prints "000" itself on failure; the fallback replaces the value | |
| # (an inline `|| echo "000"` would produce "000000" and defeat the check below). | |
| STATUS=$(curl -s --max-time 5 \ | |
| --resolve "${API_HOSTNAME}:443:127.0.0.1" \ | |
| -o "$RESP_FILE" \ | |
| -w "%{http_code}" \ | |
| "https://${API_HOSTNAME}/health" \ | |
| --insecure 2>/dev/null) || STATUS="000" | |
| BODY=$(cat "$RESP_FILE" 2>/dev/null || echo "") | |
| echo "HTTP: $STATUS BODY: $BODY" | |
| if [ "$STATUS" = "200" ] && echo "$BODY" | grep -q '"status":"ok"'; then | |
| echo "✓ /health OK via HTTPS (attempt $i)" | |
| exit 0 | |
| fi | |
| if [ "$STATUS" = "000" ]; then | |
| echo "⚠ HTTPS status=000 (host→Docker routing; in-network is authoritative)" | |
| fi | |
| sleep 2 | |
| done | |
| echo "❌ /health failed after 10 attempts" | |
| exit 1 | |
| # Executes the repository's smoke-test script on the runner with the API | |
| # endpoint and test credentials supplied from repository secrets. | |
| - name: Run smoke tests | |
| env: | |
| API_BASE_URL: ${{ secrets.API_BASE_URL }} | |
| SUPABASE_URL: ${{ secrets.SUPABASE_URL }} | |
| SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY }} | |
| FT_EMP_EMAIL: ${{ secrets.FT_EMP_EMAIL }} | |
| FT_EMP_PASSWORD: ${{ secrets.FT_EMP_PASSWORD }} | |
| FT_ADMIN_EMAIL: ${{ secrets.FT_ADMIN_EMAIL }} | |
| FT_ADMIN_PASSWORD: ${{ secrets.FT_ADMIN_PASSWORD }} | |
| run: | | |
| chmod +x scripts/smoke-test.sh | |
| ./scripts/smoke-test.sh | |
| # Always attempt the upload so a report is kept even when smoke tests fail. | |
| - name: Upload smoke test report | |
| uses: actions/upload-artifact@v4 | |
| if: always() | |
| with: | |
| path: smoke-report.json | |
| name: smoke-test-report-${{ github.sha }} | |
| retention-days: 30 | |
| # Only reached when every previous step in this job succeeded. | |
| - name: Deployment summary | |
| run: | | |
| RULE="=====================================================" | |
| printf '%s\n' \ | |
| "$RULE" \ | |
| " Production Deployment: COMPLETE ✅" \ | |
| "$RULE" \ | |
| " Commit: ${{ github.sha }}" \ | |
| " /health: OK" \ | |
| " /ready: OK" \ | |
| " Smoke: passed" \ | |
| "$RULE" | |
| # --------------------------------------------------------------------------- | |
| # JOB: rollback | |
| # | |
| # Triggered automatically when vps-readiness-check, deploy, api-health-gate, sync-infra, sync-monitoring, OR health-and-smoke fails. | |
| # Restores the previously healthy Blue-Green slot via the rollback script. | |
| # 'if: always()' ensures this job can evaluate even if upstream jobs failed. | |
| # --------------------------------------------------------------------------- | |
| rollback: | |
| name: Rollback Deployment (auto) | |
| runs-on: ubuntu-latest | |
| needs: [vps-readiness-check, deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke] | |
| timeout-minutes: 10 | |
| # always() lets the condition be evaluated after upstream failures; the job | |
| # then runs only if at least one listed job ended with result 'failure'. | |
| # NOTE(review): a vps-readiness-check failure means deploy never ran — confirm | |
| # that rollback.sh --auto is a safe no-op in that case. | |
| if: | | |
| always() && | |
| ( | |
| needs.vps-readiness-check.result == 'failure' || | |
| needs.deploy.result == 'failure' || | |
| needs.api-health-gate.result == 'failure' || | |
| needs.sync-infra.result == 'failure' || | |
| needs.sync-monitoring.result == 'failure' || | |
| needs.health-and-smoke.result == 'failure' | |
| ) | |
| steps: | |
| # Prints which upstream jobs failed, for the audit trail. | |
| - name: Log rollback trigger | |
| run: | | |
| echo "ROLLBACK_TRIGGERED=TRUE | FAILED_JOBS:" | |
| [ "${{ needs.vps-readiness-check.result }}" = "failure" ] && echo " - vps-readiness-check" | |
| [ "${{ needs.deploy.result }}" = "failure" ] && echo " - deploy" | |
| [ "${{ needs.api-health-gate.result }}" = "failure" ] && echo " - api-health-gate" | |
| [ "${{ needs.sync-infra.result }}" = "failure" ] && echo " - sync-infra" | |
| [ "${{ needs.sync-monitoring.result }}" = "failure" ] && echo " - sync-monitoring" | |
| [ "${{ needs.health-and-smoke.result }}" = "failure" ] && echo " - health-and-smoke" | |
| # Must stay last: the `[ … ] && echo` lines above may leave a non-zero | |
| # status, and the step's exit code is that of its final command. | |
| echo "SHA=${{ github.sha }}" | |
| - name: Rollback on VPS | |
| uses: appleboy/ssh-action@v1.0.3 | |
| with: | |
| host: ${{ secrets.DO_HOST }} | |
| username: ${{ secrets.DO_USER }} | |
| key: ${{ secrets.DO_SSH_KEY }} | |
| script: | | |
| set -euo pipefail | |
| export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" | |
| echo "USER=$(whoami)" | |
| echo "HOME=$HOME" | |
| echo "PWD=$(pwd)" | |
| [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } | |
| # Diagnostics only: run after the guard and never let a listing failure | |
| # abort the rollback under `set -e`. | |
| ls -la "$HOME" || true | |
| ls -la "$DEPLOY_ROOT" || true | |
| cd "$DEPLOY_ROOT" | |
| chmod +x scripts/*.sh | |
| ./scripts/rollback.sh --auto | |
| # Log final state | |
| ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "unknown") | |
| echo "ROLLBACK_COMPLETE | ACTIVE_SLOT=$ACTIVE_SLOT | SHA=${{ github.sha }}" | |