Skip to content

Deploy to Production #284

Deploy to Production

Deploy to Production #284

Workflow file for this run

# .github/workflows/deploy.yml
#
# Production Deployment Pipeline
#
# Design principles:
# 1. Triggered ONLY after CodeQL deep scan completes successfully β€” no polling, no race.
# Uses workflow_run event: deploy is event-driven, not concurrent with security scan.
# 2. Runs ALL validation from scratch β€” no trust built on PR results alone
# 3. Trivy scan runs BEFORE Docker push β€” vulnerable images never reach the registry
# 4. target: production + build-args mirror pr.yml exactly (bit-for-bit parity)
# 5. Image digest verified against PR simulation artifact when available
# 6. Blue-Green deploy with automatic rollback on health or smoke test failure
# 7. timeout-minutes on every job β€” hung processes never block CI indefinitely
# 8. npm ci retried up to 3x β€” registry flakiness never kills a valid deploy
#
# Pipeline order:
#   codeql-gate ─┬─► validate ────────────┐
#                ├─► test-api ────────────┼─► build-scan-push ─► vps-readiness-check ─► deploy
#                └─► infra-leakage-guard ─┘                                               │
#                                                                                 api-health-gate
#                                                                                         │
#                                                                                 health-and-smoke
#                                                                                         │
#                                                      rollback ◄──────── (on failure) ───┘
name: Deploy to Production

on:
  # Triggered ONLY when the CodeQL deep scan workflow completes on master.
  # This replaces the previous push trigger + polling approach:
  #   - No race conditions (workflow_run fires AFTER codeql-deep finishes)
  #   - No API polling loops or timing-dependent checks
  #   - Deployment is blocked at the event level if CodeQL did not succeed
  workflow_run:
    # Must match the `name:` of the CodeQL deep-scan workflow exactly.
    workflows: ["CodeQL — Deep Scan (post-merge)"]
    types:
      - completed
    branches:
      - master
  # Manual dispatch retained for emergency/hotfix deploys.
  # The codeql-gate job enforces the conclusion check only for workflow_run.
  workflow_dispatch:

# Never cancel an in-progress deployment — let it finish or fail cleanly.
concurrency:
  group: production-deploy
  cancel-in-progress: false

# Default to read-only. Jobs that need additional access declare it explicitly.
permissions:
  contents: read
jobs:
  # ---------------------------------------------------------------------------
  # JOB: codeql-gate
  #
  # First job in every deploy run. Two responsibilities:
  #
  # 1. SECURITY GATE (workflow_run only):
  #    Reads github.event.workflow_run.conclusion and fails hard if CodeQL
  #    did not pass. This makes the event-driven guarantee explicit and
  #    visible in the pipeline UI.
  #
  # 2. SHA RESOLUTION:
  #    On workflow_run, github.sha = HEAD of default branch at event time,
  #    NOT the commit that triggered CodeQL. We must deploy exactly the SHA
  #    that was scanned. Exports deploy_sha = github.event.workflow_run.head_sha
  #    so all downstream jobs checkout and tag the correct commit.
  #    On workflow_dispatch, deploy_sha = github.sha (HEAD of triggered branch).
  #
  # All subsequent jobs that do git checkout use ref: needs.codeql-gate.outputs.deploy_sha.
  # ---------------------------------------------------------------------------
  codeql-gate:
    name: CodeQL Security Gate
    runs-on: ubuntu-latest
    timeout-minutes: 5
    outputs:
      deploy_sha: ${{ steps.sha.outputs.deploy_sha }}
    steps:
      - name: Resolve deploy SHA
        id: sha
        # Event payload values are passed via env (not inline ${{ }} in the
        # script body) so they are never interpreted by the shell parser.
        env:
          EVENT_NAME: ${{ github.event_name }}
          RUN_HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
          FALLBACK_SHA: ${{ github.sha }}
        run: |
          if [ "$EVENT_NAME" = "workflow_run" ]; then
            echo "deploy_sha=$RUN_HEAD_SHA" >> "$GITHUB_OUTPUT"
          else
            echo "deploy_sha=$FALLBACK_SHA" >> "$GITHUB_OUTPUT"
          fi
      - name: Verify CodeQL deep scan passed
        if: github.event_name == 'workflow_run'
        # head_branch/conclusion come from the triggering event payload;
        # keep them out of the script text to avoid shell injection.
        env:
          CONCLUSION: ${{ github.event.workflow_run.conclusion }}
          BRANCH: ${{ github.event.workflow_run.head_branch }}
          SHA: ${{ github.event.workflow_run.head_sha }}
        run: |
          echo "CodeQL deep scan conclusion : $CONCLUSION"
          echo "Scanned commit SHA          : $SHA"
          echo "Head branch                 : $BRANCH"
          # Branch guard: only master commits deploy to production.
          # The workflow_run trigger already filters branches: [master], but this
          # explicit check makes the policy visible in the job log and provides a
          # hard error if the filter is ever widened accidentally.
          if [ "$BRANCH" != "master" ]; then
            echo "::error::Deploy blocked — head_branch=$BRANCH (only master is allowed to deploy to production)."
            exit 1
          fi
          if [ "$CONCLUSION" != "success" ]; then
            echo "::error::CodeQL deep scan did not pass (conclusion=$CONCLUSION)."
            echo "  Deployment is blocked. Review findings before retrying:"
            echo "  https://github.com/${{ github.repository }}/security/code-scanning"
            exit 1
          fi
          echo "✓ CodeQL gate passed — safe to deploy SHA $SHA (branch=$BRANCH)"
# ---------------------------------------------------------------------------
# JOB: validate
#
# Fast pre-flight: TypeScript check + dependency audit.
# Runs in parallel with test-api to maximise pipeline speed.
# ---------------------------------------------------------------------------
validate:
name: Validate (typecheck + audit)
runs-on: ubuntu-latest
needs: [codeql-gate]
timeout-minutes: 10
steps:
- name: Confirm deployment trigger
run: |
echo "========================================="
echo "Deployment triggered on master"
echo " Commit SHA : ${{ github.sha }}"
echo " Event : ${{ github.event_name }}"
echo " Ref : ${{ github.ref }}"
echo "========================================="
- name: Checkout
uses: actions/checkout@v5
with:
ref: ${{ needs.codeql-gate.outputs.deploy_sha }}
- name: Setup Node.js 24
uses: actions/setup-node@v5
with:
node-version: '24'
cache: npm
cache-dependency-path: package-lock.json
- name: Install dependencies (with retry)
run: |
echo "::group::npm ci"
for attempt in 1 2 3; do
npm ci && break
[ $attempt -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; }
echo "Attempt $attempt failed β€” retrying in 15s..."
sleep 15
done
echo "::endgroup::"
- name: TypeScript check
run: npm run typecheck
- name: Env contract guard (no direct process.env outside env.ts)
run: |
if grep -r --include="*.ts" "process\.env" src/ \
| grep -v "src/config/env\.ts"; then
echo "::error::Direct process.env access detected outside env.ts β€” use: import { env } from './config/env.js' instead"
echo " Use: import { env } from './config/env.js' instead"
exit 1
fi
echo "βœ… Env contract clean β€” no direct process.env access outside env.ts"
# ---------------------------------------------------------------------------
# JOB: test-api
#
# Full backend test suite β€” unit tests then integration tests.
# Runs in parallel with validate.
# ---------------------------------------------------------------------------
test-api:
name: API Tests (unit + integration)
runs-on: ubuntu-latest
needs: [codeql-gate]
timeout-minutes: 15
env:
SUPABASE_URL: ${{ secrets.SUPABASE_URL_TEST }}
SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY_TEST }}
SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY_TEST }}
steps:
- name: Checkout
uses: actions/checkout@v5
with:
ref: ${{ needs.codeql-gate.outputs.deploy_sha }}
- name: Setup Node.js 24
uses: actions/setup-node@v5
with:
node-version: '24'
cache: npm
cache-dependency-path: package-lock.json
- name: Install dependencies (with retry)
run: |
echo "::group::npm ci"
for attempt in 1 2 3; do
npm ci && break
[ $attempt -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; }
echo "Attempt $attempt failed β€” retrying in 15s..."
sleep 15
done
echo "::endgroup::"
- name: Run all tests
run: npm test
# ---------------------------------------------------------------------------
# JOB: infra-leakage-guard
#
# Pre-deploy safety gate: ensures the API repo has not re-introduced
# references to infra concerns (monitoring stack, /ready in deploy path).
# Runs in parallel with validate and test-api.
#
# Guards:
# 1. No alertmanager/docker-compose.monitoring client code in src/ or tests/
# 2. No docker-compose.monitoring references in deploy.sh or deploy.yml executable steps
# 3. No /ready usage in scripts/deploy.sh (health gate must use /health only)
# ---------------------------------------------------------------------------
infra-leakage-guard:
name: Infra Leakage Guard
runs-on: ubuntu-latest
needs: [codeql-gate]
timeout-minutes: 5
steps:
- name: Checkout
uses: actions/checkout@v5
with:
ref: ${{ needs.codeql-gate.outputs.deploy_sha }}
- name: Block monitoring infra client references in API source
run: |
# The API legitimately uses prom-client (prometheus.ts plugin) and emits
# OTLP traces. What must NOT appear is external infra client code β€”
# i.e., direct references to alertmanager, loki push clients, or
# docker-compose.monitoring in the application source.
# Exclude comment-only lines (-h suppresses filenames for grep -Ev).
# With bash -o pipefail (GHA default), grep exits 1 when there are zero matches;
# that must not fail the step β€” only non-empty LEAKS after filtering is an error.
LEAKS=$(grep -rhE "(alertmanager|docker-compose\.monitoring)" src/ tests/ 2>/dev/null \
| grep -Ev '^\s*(//|#|\*|/\*)' || true)
if [ -n "$LEAKS" ]; then
echo "::error::Infra client references found in src/ or tests/"
echo "$LEAKS"
exit 1
fi
echo "βœ“ No alertmanager/monitoring-compose references in src/ or tests/"
- name: Block docker-compose.monitoring references in deploy path
run: |
# Check deploy.sh (the executable deploy script) only.
# Searching deploy.yml for its own guard step is circular and self-defeating;
# the workflow's guard is enforced at the job level by the existence of this step.
if grep -E "docker-compose\.monitoring" scripts/deploy.sh 2>/dev/null | grep -Ev '^\s*#'; then
echo "::error::deploy.sh references docker-compose.monitoring β€” deploy must be monitoring-independent"
exit 1
fi
echo "βœ“ No docker-compose.monitoring references in scripts/deploy.sh"
- name: Block /ready in deploy path (deploy.sh)
run: |
if grep -E "(/ready)" scripts/deploy.sh | grep -Ev '^\s*#'; then
echo "::error::deploy.sh references /ready β€” deploy gate must use /health only"
exit 1
fi
echo "βœ“ deploy.sh does not reference /ready"
- name: Infra contract naming guard
run: |
# Enforce canonical naming contract (docs/infra-contract.md).
# Redis guard: scan the repo but skip docs, tests, markdown, and known local-dev fixtures.
FAIL=0
# Guard 1: no stale network name (fieldtrack_network is not the canonical name)
if grep -rE '\bfieldtrack_network\b' src/ scripts/ \
--include='*.ts' --include='*.sh' \
2>/dev/null | grep -Ev '^[^:]+:\s*(#|//)'; then
echo "::error::Forbidden network name 'fieldtrack_network' found β€” canonical name is 'api_network'"
FAIL=1
fi
# Guard 2: no localhost Redis URLs outside allowed paths (see docs/infra-contract.md)
if grep -rE 'redis://localhost:[0-9]+|redis://127\.0\.0\.1:[0-9]+' . \
--exclude-dir=docs \
--exclude-dir=tests \
--exclude-dir=node_modules \
--exclude-dir=.git \
--exclude-dir=codeql-db \
--exclude='*.md' \
--exclude='*.test.ts' \
--exclude='*.unit.ts' \
--exclude='.env.example.dev' \
2>/dev/null | grep -Fv 'env-setup.ts' | grep -q .; then
echo "::error::localhost Redis URL found in production paths β€” canonical URL is redis://redis:6379"
FAIL=1
fi
[ "$FAIL" -eq 0 ] || exit 1
echo "βœ“ Infra contract naming guard passed (api_network, redis://redis:6379)"
# ---------------------------------------------------------------------------
# JOB: build-scan-push
#
# Three-phase security gate β€” identical build config to pr.yml:
# Phase 1 β€” Build locally (target: production, same build-args, same cache)
# Phase 2 β€” Trivy scan: pinned aquasec/trivy:0.49.1 Docker image, exit-code 1
# on HIGH/CRITICAL (blocks push). NOT trivy-action β€” supply-chain safe.
# DB pre-pulled, scan runs --network none (air-gapped).
# Phase 3 β€” Push exact scanned image to GHCR (no rebuild)
#
# Image digest verification:
# After building, the digest is compared against the digest stored by
# pr.yml's production-simulation job. A match confirms bit-for-bit parity
# between what was validated in PR and what is being deployed.
# Comparison is best-effort (continue-on-error) because the merge commit
# SHA may differ from the PR head SHA on squash-merges.
# ---------------------------------------------------------------------------
build-scan-push:
name: Build, Scan & Push Docker Image
runs-on: ubuntu-latest
needs: [codeql-gate, validate, test-api, infra-leakage-guard]
timeout-minutes: 25
permissions:
contents: read
packages: write
security-events: write
outputs:
sha_short: ${{ steps.meta.outputs.sha_short }}
digest: ${{ steps.digest.outputs.digest }}
deploy_sha: ${{ steps.meta.outputs.deploy_sha }}
steps:
- name: Checkout
uses: actions/checkout@v5
with:
ref: ${{ needs.codeql-gate.outputs.deploy_sha }}
- name: Extract commit SHA
id: meta
env:
DEPLOY_SHA: ${{ needs.codeql-gate.outputs.deploy_sha }}
run: |
echo "sha_short=${DEPLOY_SHA::7}" >> "$GITHUB_OUTPUT"
echo "deploy_sha=$DEPLOY_SHA" >> "$GITHUB_OUTPUT"
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Pull base images (force fresh manifest, prevent stale GHA cache)
run: |
docker pull node:24.2.0-bookworm-slim
docker pull gcr.io/distroless/nodejs24-debian12:nonroot
# Phase 1: Build into local Docker daemon for scanning.
# EXACT same parameters as pr.yml production-simulation:
# target: production, build-args: NODE_ENV=production, GHA cache.
# CACHE_BUSTER forces rebuild when package-lock.json changes (prevents stale deps).
# Cache scoped to production to prevent cross-branch contamination from PR builds.
- name: Build Docker image (pre-scan, no push)
uses: docker/build-push-action@v6
with:
context: .
file: ./Dockerfile
target: production
build-args: |
NODE_ENV=production
CACHE_BUSTER=${{ hashFiles('**/package-lock.json') }}
push: false
load: true
pull: true
tags: |
fieldtrack-api:${{ steps.meta.outputs.sha_short }}
cache-from: type=gha,scope=production
cache-to: type=gha,mode=max,scope=production
# Verify Node.js runtime β€” exercises TLS stack, not just compile-time version constant.
# tls.createSecureContext() fails if libssl linkage is broken, proving runtime health.
- name: Verify Node.js runtime (TLS operational check)
run: |
IMAGE_NAME="fieldtrack-api:${{ steps.meta.outputs.sha_short }}"
echo "Testing image: $IMAGE_NAME"
docker run --rm \
--entrypoint /nodejs/bin/node \
"$IMAGE_NAME" \
-e "
const crypto = require('crypto');
const tls = require('tls');
const ctx = tls.createSecureContext();
if (!ctx) { process.stderr.write('FAIL: TLS context failed\n'); process.exit(1); }
const h = crypto.createHash('sha256').update('smoke').digest('hex');
if (!h) { process.stderr.write('FAIL: hash failed\n'); process.exit(1); }
process.stdout.write('node=' + process.versions.node + ' openssl=' + process.versions.openssl + ' tls=ok\n');
"
# Capture the content-addressable image digest.
# With cache scoping and cache busting, digest should always reproduce correctly.
- name: Capture image digest
id: digest
run: |
IMAGE_NAME="fieldtrack-api:${{ steps.meta.outputs.sha_short }}"
DIGEST=$(docker inspect "$IMAGE_NAME" --format='{{.Id}}')
echo "digest=$DIGEST" >> "$GITHUB_OUTPUT"
echo "::group::Build traceability"
echo " Commit SHA : ${{ github.sha }}"
echo " Image tag : $IMAGE_NAME"
echo " Image digest : $DIGEST"
echo "::endgroup::"
# Compare this digest with the one stored by pr.yml's production-simulation.
# A match = bit-for-bit parity. A mismatch = code/cache divergence (warning).
# continue-on-error: true β€” squash merges produce a new commit SHA, which
# may cause minor divergence even with identical source code.
- name: Verify image digest parity with PR simulation
continue-on-error: true
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Find the PR number associated with this merge commit
PR_NUMBER=$(gh api \
"/repos/${{ github.repository }}/commits/${{ github.sha }}/pulls" \
--header "X-GitHub-Api-Version: 2022-11-28" \
--jq '.[0].number // empty' 2>/dev/null || echo "")
if [ -z "$PR_NUMBER" ]; then
echo "No associated PR found for commit ${{ github.sha }} β€” skipping digest comparison."
exit 0
fi
echo "Associated PR: #${PR_NUMBER}"
# Find the most recent successful pr.yml run for this PR
RUN_ID=$(gh run list \
--repo "${{ github.repository }}" \
--workflow "pr.yml" \
--json databaseId,conclusion,headSha \
--jq "map(select(.conclusion == \"success\")) | .[0].databaseId // empty" \
2>/dev/null || echo "")
if [ -z "$RUN_ID" ]; then
echo "No successful PR validation run found β€” skipping digest comparison."
exit 0
fi
# Download the image-digest artifact from that run
gh run download "$RUN_ID" \
--repo "${{ github.repository }}" \
--name "image-digest-pr-${PR_NUMBER}" \
--dir /tmp/pr-digest \
2>/dev/null || true
if [ ! -f /tmp/pr-digest/image-digest.txt ]; then
echo "PR image-digest artifact not found β€” skipping comparison."
exit 0
fi
PR_DIGEST=$(cat /tmp/pr-digest/image-digest.txt)
DEPLOY_DIGEST="${{ steps.digest.outputs.digest }}"
echo "PR simulation digest: $PR_DIGEST"
echo "Deploy image digest: $DEPLOY_DIGEST"
if [ "$PR_DIGEST" = "$DEPLOY_DIGEST" ]; then
echo "βœ“ Digest match β€” bit-for-bit parity confirmed between PR and deploy."
else
echo "⚠ Digest mismatch β€” builds diverged between PR and deploy."
echo " Expected on squash-merges where the commit SHA changes."
echo " Ensure no source changes occurred between PR approval and deploy trigger."
fi
# Phase 2: Trivy scan β€” image pinned by immutable digest, NOT trivy-action.
# aquasec/trivy:0.49.1 β†’ sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc
# Identical severity gates to pr.yml (HIGH,CRITICAL / exit-code 1).
# Two-phase: DB downloaded first (needs network), then scan runs --network none.
- name: Get date for Trivy DB cache key
id: trivy-date
run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
- name: Cache Trivy DB (daily refresh)
uses: actions/cache@v4
with:
path: /tmp/trivy-cache
key: trivy-db-${{ runner.os }}-${{ steps.trivy-date.outputs.date }}
restore-keys: |
trivy-db-${{ runner.os }}-
- name: Pull Trivy vulnerability database
run: |
docker run --rm \
-v /tmp/trivy-cache:/root/.cache \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc \
image --download-db-only
- name: Scan image with Trivy (HIGH/CRITICAL, ignore-unfixed)
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
run: |
SCAN_PASSED=false
for i in 1 2 3; do
if docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
-v "$(pwd)/.trivyignore:/tmp/.trivyignore:ro" \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--ignore-unfixed \
--severity HIGH,CRITICAL \
--exit-code 1 \
--ignorefile /tmp/.trivyignore \
"$IMAGE_NAME"; then
SCAN_PASSED=true
break
fi
echo "Trivy attempt $i failed..."
[ "$i" -lt 3 ] && sleep 5
done
if [ "$SCAN_PASSED" != "true" ]; then
echo "::error::Trivy scan failed after 3 attempts β€” HIGH/CRITICAL vulnerabilities found or scan error."
exit 1
fi
echo "βœ“ Trivy scan passed (HIGH/CRITICAL, ignore-unfixed)"
- name: Scan for unfixed CRITICAL vulnerabilities (informational)
continue-on-error: true
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
run: |
UNFIXED_COUNT=$(docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--severity CRITICAL \
--format json \
"$IMAGE_NAME" | jq '[.Results[]?.Misconfigurations[]? // .Results[]?.Vulnerabilities[]? | select(.FixedVersion == null or .FixedVersion == "")] | length')
if [ "$UNFIXED_COUNT" -gt 0 ]; then
echo "⚠ WARNING: $UNFIXED_COUNT unfixed CRITICAL vulnerabilities found"
echo " (No patches available upstream β€” waiting for vendor fix)"
docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--severity CRITICAL \
"$IMAGE_NAME" >> /tmp/unfixed-critical.log || true
else
echo "βœ“ No unfixed CRITICAL vulnerabilities"
fi
- name: Generate Trivy scan results (SARIF for GitHub Security)
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
run: |
docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
-v "$(pwd):/workspace" \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--format sarif \
--output /workspace/trivy-results.sarif \
"$IMAGE_NAME"
echo "βœ“ SARIF results written to trivy-results.sarif"
- name: Upload Trivy scan results to GitHub Security
uses: github/codeql-action/upload-sarif@v3
with:
sarif_file: trivy-results.sarif
category: 'trivy-image-scan'
# Phase 3: Scan passed β€” push the exact scanned image (same layer digests).
# Uses docker tag + push rather than rebuilding to guarantee what was scanned
# is exactly what lands in the registry.
- name: Verify image digest unchanged before push
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
run: |
# docker inspect .Id returns the config digest (sha256:...) which is
# stable across tag operations β€” same value captured in the digest step.
CURRENT=$(docker inspect "$IMAGE_NAME" --format='{{.Id}}')
echo "Expected digest : $IMAGE_DIGEST"
echo "Current digest : $CURRENT"
if [ "$CURRENT" != "$IMAGE_DIGEST" ]; then
echo "ERROR: image digest changed between scan and push β€” aborting."
exit 1
fi
echo "βœ“ Digest verified β€” pushing exactly what was scanned."
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Push verified image to registry
run: |
OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
# Tag and push using the FULL commit SHA so deploy.sh can reference
# the exact image via IMAGE=ghcr.io/.../api:$deploy_sha.
# Short SHA (sha_short) is kept as a convenience alias only.
docker tag \
fieldtrack-api:${{ steps.meta.outputs.sha_short }} \
ghcr.io/${OWNER}/api:${{ steps.meta.outputs.deploy_sha }}
docker push ghcr.io/${OWNER}/api:${{ steps.meta.outputs.deploy_sha }}
# Also push short-SHA alias for human convenience (inspect, debugging).
docker tag \
fieldtrack-api:${{ steps.meta.outputs.sha_short }} \
ghcr.io/${OWNER}/api:${{ steps.meta.outputs.sha_short }}
docker push ghcr.io/${OWNER}/api:${{ steps.meta.outputs.sha_short }}
echo "βœ“ Pushed ghcr.io/${OWNER}/api:${{ steps.meta.outputs.deploy_sha }} (full SHA β€” reference for deploy)"
# Use the same pinned Trivy image to generate the SBOM β€” no additional
# tool dependency, no unpinned action, same supply-chain guarantees.
- name: Generate SBOM (CycloneDX)
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
run: |
docker run --rm \
-v /var/run/docker.sock:/var/run/docker.sock \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--format cyclonedx \
--output /dev/stdout \
"$IMAGE_NAME" > sbom.json
- name: Upload SBOM artifact
uses: actions/upload-artifact@v4
with:
name: sbom-${{ steps.meta.outputs.sha_short }}
path: sbom.json
retention-days: 90
- name: Save build provenance
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
run: |
echo "commit=${{ needs.codeql-gate.outputs.deploy_sha }}" > provenance.txt
echo "ref=${{ github.ref }}" >> provenance.txt
echo "image=${IMAGE_NAME}" >> provenance.txt
echo "registry_tag=ghcr.io/${{ github.repository_owner }}/api:${{ steps.meta.outputs.deploy_sha }}" >> provenance.txt
echo "digest=${IMAGE_DIGEST}" >> provenance.txt
echo "workflow=${{ github.workflow }}" >> provenance.txt
echo "run_id=${{ github.run_id }}" >> provenance.txt
- name: Upload provenance artifact
uses: actions/upload-artifact@v4
with:
name: provenance-${{ steps.meta.outputs.sha_short }}
path: provenance.txt
retention-days: 90
- name: Build & scan summary
if: always()
env:
IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
run: |
SBOM_COUNT=$(python3 -c "import json; d=json.load(open('sbom.json')); print(len(d.get('components', [])))" 2>/dev/null || echo 'n/a')
{
echo "## Build Β· Scan Β· Push"
echo "| Field | Value |"
echo "|---|---|"
echo "| Deploy SHA | \`${{ steps.meta.outputs.deploy_sha }}\` |"
echo "| Image tag (local) | \`fieldtrack-api:${{ steps.meta.outputs.sha_short }}\` |"
echo "| Registry tag | \`ghcr.io/${{ github.repository_owner }}/api:${{ steps.meta.outputs.deploy_sha }}\` |"
echo "| Image digest | \`${IMAGE_DIGEST}\` |"
echo "| SBOM components | ${SBOM_COUNT} |"
echo "| Trivy gate | HIGH,CRITICAL / exit-code 1 / ignore-unfixed |"
echo "| Registry | ghcr.io/${{ github.repository_owner }}/api |"
} >> "$GITHUB_STEP_SUMMARY"
# ---------------------------------------------------------------------------
# JOB: vps-readiness-check
#
# Validates the VPS is in a deployable state BEFORE running the deploy.
# Runs in PARALLEL with vps-readiness-check (both depend on build-scan-push).
# Both must succeed before deploy is allowed to proceed.
#
# Delegates to scripts/vps-readiness-check.sh which checks:
# - Docker daemon running
# - api_network exists (auto-created if missing)
# - Ports 80/443 free from non-nginx processes
# - No API containers with host port bindings
# - Required .env file present
# - Runtime directories present (auto-created if missing)
# - Sufficient disk space (auto-prunes if borderline)
# ---------------------------------------------------------------------------
vps-readiness-check:
name: VPS Readiness Gate
runs-on: ubuntu-latest
needs: [build-scan-push]
timeout-minutes: 10
steps:
- name: Run VPS readiness check via SSH
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
[ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
# Pull latest scripts without full deploy
git fetch origin master --depth=1
git checkout origin/master -- scripts/vps-readiness-check.sh 2>/dev/null || true
chmod +x scripts/vps-readiness-check.sh
./scripts/vps-readiness-check.sh
# ---------------------------------------------------------------------------
# JOB: deploy
#
# Blue-Green deployment to VPS via SSH.
# deploy.sh manages slot switching and container health.
#
# DEPENDENCY GATES (both must pass):
# - vps-readiness-check: ensures VPS can accept the deployment
# ---------------------------------------------------------------------------
deploy:
name: Deploy (Blue-Green SSH)
runs-on: ubuntu-latest
needs: [build-scan-push, vps-readiness-check]
timeout-minutes: 20
steps:
- name: Validate required deployment secrets
env:
API_BASE_URL: ${{ secrets.API_BASE_URL }}
CORS_ORIGIN: ${{ secrets.CORS_ORIGIN }}
run: |
if [ -z "${API_BASE_URL:-}" ]; then
echo "::error::API_BASE_URL secret is not set. Deployment aborted."
exit 1
fi
echo "βœ“ API_BASE_URL is set"
if [ -z "${CORS_ORIGIN:-}" ]; then
echo "::error::CORS_ORIGIN secret is not set. Deployment aborted."
exit 1
fi
echo "βœ“ CORS_ORIGIN is set"
- name: Log deployment metadata and trigger info
run: |
DEPLOY_SHA="${{ needs.build-scan-push.outputs.deploy_sha }}"
{
echo "## Deployment Initiated"
echo "| Field | Value |"
echo "|---|---|"
echo "| Deploy SHA | \`${DEPLOY_SHA}\` |"
echo "| Trigger event | ${{ github.event_name }} |"
echo "| Triggered by | ${{ github.actor }} |"
echo "| Branch | ${{ github.ref_name }} |"
echo "| Workflow run | [${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) |"
echo "| Commit message | \`${{ github.event.head_commit.message }}\` |"
} >> "$GITHUB_STEP_SUMMARY"
echo "[DEPLOY] Deployment initiated β€” SHA=${DEPLOY_SHA} EVENT=${{ github.event_name }} ACTOR=${{ github.actor }}"
- name: Blue-Green deploy via SSH
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
T0=$(date +%s)
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
[ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
# Enforce repo is at the exact SHA being deployed (issue 7 β€” prevents
# stale deploy scripts if another commit landed during this pipeline run).
git fetch origin
git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }}
chmod +x scripts/*.sh
./scripts/deploy.sh "${{ needs.build-scan-push.outputs.deploy_sha }}"
echo "[DEPLOY] Deploy completed in $(($(date +%s) - T0))s"
- name: Log deployment state (slot + SHA for debugging)
uses: appleboy/ssh-action@v1.0.3
if: always()
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
ACTIVE_SLOT=$(cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo "unknown")
ACTIVE_CONTAINER="api-${ACTIVE_SLOT}"
DEPLOY_STATUS="UNKNOWN"
# Health check via in-network curl container β€” exercises Docker DNS
# and bridge routing (same path nginx uses). NO host port binding needed.
FT_CURL_IMG="curlimages/curl:8.7.1"
if docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1; then
if docker run --rm --network api_network "$FT_CURL_IMG" \
-sf --max-time 5 "http://$ACTIVE_CONTAINER:3000/health" >/dev/null 2>&1; then
DEPLOY_STATUS="SUCCESS"
else
DEPLOY_STATUS="UNHEALTHY"
fi
else
DEPLOY_STATUS="CONTAINER_MISSING"
fi
echo "[DEPLOY] state=$DEPLOY_STATUS slot=$ACTIVE_SLOT container=$ACTIVE_CONTAINER sha=${{ github.sha }}"
# ---------------------------------------------------------------------------
# JOB: api-health-gate (Step E+)
#
# Validates the API container is healthy after deploy.
# Ensures /health returns 200 before proceeding to smoke tests.
# If the API is not healthy at this point, rollback is triggered.
# ---------------------------------------------------------------------------
api-health-gate:
name: API Health Gate
runs-on: ubuntu-latest
needs: [deploy]
timeout-minutes: 5
steps:
- name: Verify API container is healthy after deploy
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
[ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
ACTIVE_SLOT=$(cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo "blue")
ACTIVE_CONTAINER="api-$ACTIVE_SLOT"
docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1 || {
echo "::error::Container $ACTIVE_CONTAINER not found"
exit 1
}
FT_CURL_IMG="curlimages/curl:8.7.1"
for i in $(seq 1 15); do
STATUS=$(docker run --rm --network api_network "$FT_CURL_IMG" \
-s -o /dev/null -w "%{http_code}" \
"http://$ACTIVE_CONTAINER:3000/health" 2>/dev/null || echo "000")
if [ "$STATUS" = "200" ]; then
echo "[DEPLOY] API healthy (slot=$ACTIVE_SLOT attempt=$i)"
exit 0
fi
sleep 2
done
echo "::error::API /health did not return 200 after 30s"
docker logs "$ACTIVE_CONTAINER" --tail 30 >&2 2>/dev/null || true
exit 1
# ---------------------------------------------------------------------------
# JOB: health-and-smoke
#
# Post-deploy health verification and CI coupling guard.
# Failure here triggers the rollback job automatically.
# ---------------------------------------------------------------------------
health-and-smoke:
name: Health Checks & Smoke Tests
runs-on: ubuntu-latest
needs: [api-health-gate, build-scan-push]
timeout-minutes: 15
steps:
- name: Checkout
uses: actions/checkout@v5
with:
ref: ${{ needs.build-scan-push.outputs.deploy_sha }}
- name: CI guard β€” deploy.sh must not reference /ready or monitoring stack
run: |
set -euo pipefail
echo "Checking deploy.sh for forbidden references..."
# Exclude comment lines (starting with optional whitespace then #)
if grep -E "(/ready)" scripts/deploy.sh | grep -Ev '^\s*#'; then
echo "::error::deploy.sh references /ready β€” deploy gate must only use /health"
exit 1
fi
if grep -E "(prometheus|grafana|alertmanager|loki)" scripts/deploy.sh | grep -Ev '^\s*#'; then
echo "::error::deploy.sh references monitoring stack β€” deploy must be monitoring-independent"
exit 1
fi
# Check for local repo-relative infra path coupling in scripts/src only.
# Uses the same pattern as verify-stabilization.sh: only relative paths
# (./infra/ or ../infra/) are forbidden. Absolute /opt/infra is allowed.
# Scope: scripts/ and src/ only (not workflows where guard steps live).
if grep -rE "\./infra/|\.\.\./infra/" scripts/ src/ \
--binary-files=without-match --exclude-dir=node_modules 2>/dev/null \
| grep -Ev '^[^:]+:\s*(#|//)'; then
echo "::error::Local repo-relative infra coupling (./infra/ or ../infra/) detected in scripts/ or src/"
exit 1
fi
echo "βœ“ CI guards passed: no /ready, no monitoring, no local infra coupling in deploy path"
      # Primary health gate: up to 30 attempts (~60s) against the freshly
      # switched stack. Runs on the VPS over SSH so the in-network probe can
      # reach the Docker network directly.
      - name: Wait for /health endpoint (via VPS)
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.DO_HOST }}
          username: ${{ secrets.DO_USER }}
          key: ${{ secrets.DO_SSH_KEY }}
          script: |
            set -euo pipefail
            export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
            [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
            cd "$DEPLOY_ROOT"
            # Derive the public hostname from API_BASE_URL in the deployed .env
            # (strip scheme, keep host part only).
            API_BASE_URL=$(grep -E '^API_BASE_URL=' .env 2>/dev/null | head -1 | cut -d'=' -f2- || true)
            [ -n "$API_BASE_URL" ] || { echo "::error::API_BASE_URL missing or empty in .env"; exit 1; }
            API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1)
            for i in $(seq 1 30); do
              # Phase 1: in-network (source of truth) — curl from inside
              # api_network against the nginx service, spoofing the public
              # Host header so the right vhost answers.
              if docker run --rm --network api_network \
                curlimages/curl:8.7.1 -sk --max-time 5 \
                -H "Host: ${API_HOSTNAME}" \
                https://nginx/health 2>/dev/null \
                | grep -q '"status":"ok"'; then
                echo "[DEPLOY] /health OK (in-network, attempt $i)"
                exit 0
              fi
              # Phase 2: HTTPS advisory (status=000 = host→Docker TCP issue, non-fatal)
              STATUS=$(curl -s --resolve "${API_HOSTNAME}:443:127.0.0.1" \
                -o /dev/null -w "%{http_code}" \
                "https://${API_HOSTNAME}/health" --insecure 2>/dev/null || echo "000")
              if [ "$STATUS" = "200" ]; then
                echo "[DEPLOY] /health OK (HTTPS advisory, attempt $i)"
                exit 0
              fi
              sleep 2
            done
            echo "::error::Health check failed after 30 attempts"
            exit 1
      # Final confirmation pass: same two-phase probe as the previous step,
      # capped at 10 attempts (~20s). Catches regressions that appear shortly
      # after the primary gate passed.
      - name: Wait for /health endpoint (final public check)
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.DO_HOST }}
          username: ${{ secrets.DO_USER }}
          key: ${{ secrets.DO_SSH_KEY }}
          script: |
            set -euo pipefail
            export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
            [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
            cd "$DEPLOY_ROOT"
            API_BASE_URL=$(grep -E '^API_BASE_URL=' .env 2>/dev/null | head -1 | cut -d'=' -f2- || true)
            [ -n "$API_BASE_URL" ] || { echo "::error::API_BASE_URL missing or empty in .env"; exit 1; }
            API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1)
            for i in $(seq 1 10); do
              # Phase 1: in-network (source of truth)
              if docker run --rm --network api_network \
                curlimages/curl:8.7.1 -sk --max-time 5 \
                -H "Host: ${API_HOSTNAME}" \
                https://nginx/health 2>/dev/null \
                | grep -q '"status":"ok"'; then
                echo "[DEPLOY] /health OK (in-network, attempt $i)"
                exit 0
              fi
              # Phase 2: HTTPS advisory
              STATUS=$(curl -s --resolve "${API_HOSTNAME}:443:127.0.0.1" \
                -o /dev/null -w "%{http_code}" \
                "https://${API_HOSTNAME}/health" --insecure 2>/dev/null || echo "000")
              if [ "$STATUS" = "200" ]; then
                echo "[DEPLOY] /health OK (HTTPS advisory, attempt $i)"
                exit 0
              fi
              sleep 2
            done
            echo "::error::Final health check failed after 10 attempts"
            exit 1
      # Post-deploy /ready check — informational only, never fails the deploy.
      # /health is the deploy gate (shallow, network-independent).
      # /ready reflects deep system state: Redis, workers, DB connectivity.
      # Logged in the job summary for operator visibility without blocking production.
      - name: Post-deploy /ready check (informational)
        # Best-effort by design: no set -e in the script, and the step result
        # is ignored even if SSH itself fails.
        continue-on-error: true
        uses: appleboy/ssh-action@v1.0.3
        with:
          host: ${{ secrets.DO_HOST }}
          username: ${{ secrets.DO_USER }}
          key: ${{ secrets.DO_SSH_KEY }}
          script: |
            # Resolve the currently active blue/green slot; "unknown" if the
            # marker file is absent.
            ACTIVE_SLOT=$(cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo "unknown")
            ACTIVE_CONTAINER="api-${ACTIVE_SLOT}"
            FT_CURL_IMG="curlimages/curl:8.7.1"
            # Probe the container directly on the Docker network; any failure
            # collapses to the sentinel string "UNREACHABLE".
            READY_RESP=$(docker run --rm --network api_network "$FT_CURL_IMG" \
              -sf --max-time 10 "http://$ACTIVE_CONTAINER:3000/ready" 2>/dev/null || echo "UNREACHABLE")
            READY_STATUS=$(echo "$READY_RESP" | grep -o '"status":"[^"]*"' | head -1 || echo "unknown")
            echo "[DEPLOY] /ready check: slot=$ACTIVE_SLOT status=$READY_STATUS"
            echo "[DEPLOY] /ready response: $READY_RESP"
- name: Deployment summary
run: |
DEPLOY_SHA="${{ needs.build-scan-push.outputs.deploy_sha }}"
{
echo "## Deployment Complete"
echo "| Field | Value |"
echo "|---|---|"
echo "| Status | βœ… SUCCESS |"
echo "| Deploy SHA | \`${DEPLOY_SHA}\` |"
echo "| Health gate | /health β†’ 200 |"
echo "| Post-deploy checks | passed |"
} >> "$GITHUB_STEP_SUMMARY"
echo "[DEPLOY] Production deployment complete β€” SHA=${DEPLOY_SHA} Health=OK"
# ---------------------------------------------------------------------------
# JOB: rollback
#
# Triggered automatically when POST-DEPLOY health checks fail.
#
# PHASE-AWARE CONDITION (prevents double-rollback):
# Rollback triggers ONLY when the deploy job itself SUCCEEDED (exit 0)
# AND a post-deploy CI health check subsequently failed.
#
# This means nginx traffic was already switched to the new container
# (deploy.sh reported DEPLOY_RESULT=SWITCHED) and then the CI-level
# health gate caught a regression.
#
# It does NOT trigger when:
# β€’ vps-readiness-check fails (deploy never started)
# β€’ deploy job fails (deploy.sh already handled internal recovery;
# see DEPLOY_RESULT=FAILED_PRE_SWITCH or RESTORED in deploy.sh)
#
# 'if: always()' ensures this job evaluates even if upstream jobs failed.
# ---------------------------------------------------------------------------
rollback:
name: Rollback Deployment (auto)
runs-on: ubuntu-latest
needs: [vps-readiness-check, deploy, api-health-gate, health-and-smoke]
timeout-minutes: 10
if: |
always() &&
(
needs.api-health-gate.result == 'failure' ||
needs.health-and-smoke.result == 'failure'
)
steps:
- name: Log rollback trigger
run: |
echo "::error::Rollback triggered (post-switch health failure) β€” deploy_sha=${{ needs.build-scan-push.outputs.deploy_sha }}"
[ "${{ needs.api-health-gate.result }}" = "failure" ] && echo " [ERROR] failed job: api-health-gate" || true
[ "${{ needs.health-and-smoke.result }}" = "failure" ] && echo " [ERROR] failed job: health-and-smoke" || true
- name: Rollback on VPS
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
[ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
chmod +x scripts/*.sh
./scripts/deploy.sh --rollback --auto
ACTIVE_SLOT=$(cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo "unknown")
echo "[DEPLOY] Rollback complete β€” slot=$ACTIVE_SLOT sha=${{ github.sha }}"