# --- GitHub web-UI chrome captured by copy-paste (not part of the workflow) —
# --- converted to comments so the file parses as YAML: ----------------------
# Skip to content
# Deploy to Production #278
# Deploy to Production
# Workflow file for this run
# ---------------------------------------------------------------------------
# .github/workflows/deploy.yml
#
# Production Deployment Pipeline
#
# Design principles:
# 1. Triggered ONLY after CodeQL deep scan completes successfully — no polling, no race.
# Uses workflow_run event: deploy is event-driven, not concurrent with security scan.
# 2. Runs ALL validation from scratch — no trust built on PR results alone
# 3. Trivy scan runs BEFORE Docker push — vulnerable images never reach the registry
# 4. target: production + build-args mirror pr.yml exactly (bit-for-bit parity)
# 5. Image digest verified against PR simulation artifact when available
# 6. Blue-Green deploy with automatic rollback on health or smoke test failure
# 7. timeout-minutes on every job — hung processes never block CI indefinitely
# 8. npm ci retried up to 3x — registry flakiness never kills a valid deploy
#
# Pipeline order:
#   codeql-gate
#     ├─► validate ─┐
#     └─► test-api ─┴─► build-scan-push ─► vps-readiness-check ─► deploy
#                                                                    │
#                                            api-health-gate ◄───────┘
#                                                   │
#                     sync-infra ─► sync-monitoring ─► health-and-smoke
#                                                             │
#                     rollback ◄──────────────────────────────┘ (on failure)
name: Deploy to Production
on:
# Triggered ONLY when the CodeQL deep scan workflow completes on master.
# This replaces the previous push trigger + polling approach:
# - No race conditions (workflow_run fires AFTER codeql-deep finishes)
# - No API polling loops or timing-dependent checks
# - Deployment is blocked at the event level if CodeQL did not succeed
workflow_run:
workflows: ["CodeQL — Deep Scan (post-merge)"]
types:
- completed
branches:
- master
# Manual dispatch retained for emergency/hotfix deploys.
# The codeql-gate job enforces the conclusion check only for workflow_run.
workflow_dispatch:
# Never cancel an in-progress deployment — let it finish or fail cleanly.
concurrency:
group: production-deploy
cancel-in-progress: false
# Default to read-only. Jobs that need additional access declare it explicitly.
permissions:
contents: read
jobs:
# ---------------------------------------------------------------------------
# JOB: codeql-gate
#
# First job in every deploy run. Two responsibilities:
#
# 1. SECURITY GATE (workflow_run only):
# Reads github.event.workflow_run.conclusion and fails hard if CodeQL
# did not pass. This makes the event-driven guarantee explicit and
# visible in the pipeline UI.
#
# 2. SHA RESOLUTION:
# On workflow_run, github.sha = HEAD of default branch at event time,
# NOT the commit that triggered CodeQL. We must deploy exactly the SHA
# that was scanned. Exports deploy_sha = github.event.workflow_run.head_sha
# so all downstream jobs checkout and tag the correct commit.
# On workflow_dispatch, deploy_sha = github.sha (HEAD of triggered branch).
#
# All subsequent jobs that do git checkout use ref: needs.codeql-gate.outputs.deploy_sha.
# ---------------------------------------------------------------------------
codeql-gate:
name: CodeQL Security Gate
runs-on: ubuntu-latest
timeout-minutes: 5
outputs:
deploy_sha: ${{ steps.sha.outputs.deploy_sha }}
steps:
- name: Resolve deploy SHA
id: sha
run: |
if [ "${{ github.event_name }}" = "workflow_run" ]; then
echo "deploy_sha=${{ github.event.workflow_run.head_sha }}" >> "$GITHUB_OUTPUT"
else
echo "deploy_sha=${{ github.sha }}" >> "$GITHUB_OUTPUT"
fi
- name: Verify CodeQL deep scan passed
if: github.event_name == 'workflow_run'
run: |
CONCLUSION="${{ github.event.workflow_run.conclusion }}"
BRANCH="${{ github.event.workflow_run.head_branch }}"
SHA="${{ github.event.workflow_run.head_sha }}"
echo "CodeQL deep scan conclusion : $CONCLUSION"
echo "Scanned commit SHA : $SHA"
echo "Head branch : $BRANCH"
# Branch guard: only master commits deploy to production.
# The workflow_run trigger already filters branches: [master], but this
# explicit check makes the policy visible in the job log and provides a
# hard error if the filter is ever widened accidentally.
if [ "$BRANCH" != "master" ]; then
echo "::error::Deploy blocked — head_branch=$BRANCH (only master is allowed to deploy to production)."
exit 1
fi
if [ "$CONCLUSION" != "success" ]; then
echo "::error::CodeQL deep scan did not pass (conclusion=$CONCLUSION)."
echo " Deployment is blocked. Review findings before retrying:"
echo " https://github.com/${{ github.repository }}/security/code-scanning"
exit 1
fi
echo "✓ CodeQL gate passed — safe to deploy SHA $SHA (branch=$BRANCH)"
# ---------------------------------------------------------------------------
# JOB: validate
#
# Fast pre-flight: TypeScript check + dependency audit.
# Runs in parallel with test-api to maximise pipeline speed.
# ---------------------------------------------------------------------------
validate:
name: Validate (typecheck + audit)
runs-on: ubuntu-latest
needs: [codeql-gate]
timeout-minutes: 10
steps:
- name: Confirm deployment trigger
run: |
echo "========================================="
echo "Deployment triggered on master"
echo " Commit SHA : ${{ github.sha }}"
echo " Event : ${{ github.event_name }}"
echo " Ref : ${{ github.ref }}"
echo "========================================="
- name: Checkout
uses: actions/checkout@v5
with:
ref: ${{ needs.codeql-gate.outputs.deploy_sha }}
- name: Setup Node.js 24
uses: actions/setup-node@v5
with:
node-version: '24'
cache: npm
cache-dependency-path: package-lock.json
- name: Install dependencies (with retry)
run: |
echo "::group::npm ci"
for attempt in 1 2 3; do
npm ci && break
[ $attempt -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; }
echo "Attempt $attempt failed — retrying in 15s..."
sleep 15
done
echo "::endgroup::"
- name: TypeScript check
run: npm run typecheck
- name: Env contract guard (no direct process.env outside env.ts)
run: |
if grep -r --include="*.ts" "process\.env" src/ \
| grep -v "src/config/env\.ts"; then
echo "❌ Direct process.env access detected outside env.ts"
echo " Use: import { env } from './config/env.js' instead"
exit 1
fi
echo "✅ Env contract clean — no direct process.env access outside env.ts"
# ---------------------------------------------------------------------------
# JOB: test-api
#
# Full backend test suite — unit tests then integration tests.
# Runs in parallel with validate.
# ---------------------------------------------------------------------------
test-api:
name: API Tests (unit + integration)
runs-on: ubuntu-latest
needs: [codeql-gate]
timeout-minutes: 15
env:
SUPABASE_URL: ${{ secrets.SUPABASE_URL_TEST }}
SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY_TEST }}
SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY_TEST }}
steps:
- name: Checkout
uses: actions/checkout@v5
with:
ref: ${{ needs.codeql-gate.outputs.deploy_sha }}
- name: Setup Node.js 24
uses: actions/setup-node@v5
with:
node-version: '24'
cache: npm
cache-dependency-path: package-lock.json
- name: Install dependencies (with retry)
run: |
echo "::group::npm ci"
for attempt in 1 2 3; do
npm ci && break
[ $attempt -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; }
echo "Attempt $attempt failed — retrying in 15s..."
sleep 15
done
echo "::endgroup::"
- name: Run all tests
run: npm test
# ---------------------------------------------------------------------------
# JOB: build-scan-push
#
# Three-phase security gate — identical build config to pr.yml:
# Phase 1 — Build locally (target: production, same build-args, same cache)
# Phase 2 — Trivy scan: pinned aquasec/trivy:0.49.1 Docker image, exit-code 1
# on HIGH/CRITICAL (blocks push). NOT trivy-action — supply-chain safe.
# DB pre-pulled, scan runs --network none (air-gapped).
# Phase 3 — Push exact scanned image to GHCR (no rebuild)
#
# Image digest verification:
# After building, the digest is compared against the digest stored by
# pr.yml's production-simulation job. A match confirms bit-for-bit parity
# between what was validated in PR and what is being deployed.
# Comparison is best-effort (continue-on-error) because the merge commit
# SHA may differ from the PR head SHA on squash-merges.
# ---------------------------------------------------------------------------
build-scan-push:
name: Build, Scan & Push Docker Image
runs-on: ubuntu-latest
needs: [codeql-gate, validate, test-api]
timeout-minutes: 25
permissions:
contents: read
packages: write
security-events: write
outputs:
sha_short: ${{ steps.meta.outputs.sha_short }}
digest: ${{ steps.digest.outputs.digest }}
deploy_sha: ${{ steps.meta.outputs.deploy_sha }}
steps:
- name: Checkout
uses: actions/checkout@v5
with:
ref: ${{ needs.codeql-gate.outputs.deploy_sha }}
- name: Extract commit SHA
id: meta
env:
DEPLOY_SHA: ${{ needs.codeql-gate.outputs.deploy_sha }}
run: |
echo "sha_short=${DEPLOY_SHA::7}" >> "$GITHUB_OUTPUT"
echo "deploy_sha=$DEPLOY_SHA" >> "$GITHUB_OUTPUT"
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Pull base images (force fresh manifest, prevent stale GHA cache)
run: |
docker pull node:24.2.0-bookworm-slim
docker pull gcr.io/distroless/nodejs24-debian12:nonroot
# Phase 1: Build into local Docker daemon for scanning.
# EXACT same parameters as pr.yml production-simulation:
# target: production, build-args: NODE_ENV=production, GHA cache.
# CACHE_BUSTER forces rebuild when package-lock.json changes (prevents stale deps).
# Cache scoped to production to prevent cross-branch contamination from PR builds.
- name: Build Docker image (pre-scan, no push)
uses: docker/build-push-action@v6
with:
context: .
file: ./Dockerfile
target: production
build-args: |
NODE_ENV=production
CACHE_BUSTER=${{ hashFiles('**/package-lock.json') }}
push: false
load: true
pull: true
tags: |
fieldtrack-api:${{ steps.meta.outputs.sha_short }}
cache-from: type=gha,scope=production
cache-to: type=gha,mode=max,scope=production
# Verify Node.js runtime — exercises TLS stack, not just compile-time version constant.
# tls.createSecureContext() fails if libssl linkage is broken, proving runtime health.
- name: Verify Node.js runtime (TLS operational check)
run: |
IMAGE_NAME="fieldtrack-api:${{ steps.meta.outputs.sha_short }}"
echo "Testing image: $IMAGE_NAME"
docker run --rm \
--entrypoint /nodejs/bin/node \
"$IMAGE_NAME" \
-e "
const crypto = require('crypto');
const tls = require('tls');
const ctx = tls.createSecureContext();
if (!ctx) { process.stderr.write('FAIL: TLS context failed\n'); process.exit(1); }
const h = crypto.createHash('sha256').update('smoke').digest('hex');
if (!h) { process.stderr.write('FAIL: hash failed\n'); process.exit(1); }
process.stdout.write('node=' + process.versions.node + ' openssl=' + process.versions.openssl + ' tls=ok\n');
"
# Capture the content-addressable image digest.
# With cache scoping and cache busting, digest should always reproduce correctly.
- name: Capture image digest
id: digest
run: |
IMAGE_NAME="fieldtrack-api:${{ steps.meta.outputs.sha_short }}"
DIGEST=$(docker inspect "$IMAGE_NAME" --format='{{.Id}}')
echo "digest=$DIGEST" >> "$GITHUB_OUTPUT"
echo "=== Build traceability ==="
echo " Commit SHA : ${{ github.sha }}"
echo " Image tag : $IMAGE_NAME"
echo " Image digest : $DIGEST"
# Compare this digest with the one stored by pr.yml's production-simulation.
# A match = bit-for-bit parity. A mismatch = code/cache divergence (warning).
# continue-on-error: true — squash merges produce a new commit SHA, which
# may cause minor divergence even with identical source code.
- name: Verify image digest parity with PR simulation
continue-on-error: true
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Find the PR number associated with this merge commit
PR_NUMBER=$(gh api \
"/repos/${{ github.repository }}/commits/${{ github.sha }}/pulls" \
--header "X-GitHub-Api-Version: 2022-11-28" \
--jq '.[0].number // empty' 2>/dev/null || echo "")
if [ -z "$PR_NUMBER" ]; then
echo "No associated PR found for commit ${{ github.sha }} — skipping digest comparison."
exit 0
fi
echo "Associated PR: #${PR_NUMBER}"
# Find the most recent successful pr.yml run for this PR
RUN_ID=$(gh run list \
--repo "${{ github.repository }}" \
--workflow "pr.yml" \
--json databaseId,conclusion,headSha \
--jq "map(select(.conclusion == \"success\")) | .[0].databaseId // empty" \
2>/dev/null || echo "")
if [ -z "$RUN_ID" ]; then
echo "No successful PR validation run found — skipping digest comparison."
exit 0
fi
# Download the image-digest artifact from that run
gh run download "$RUN_ID" \
--repo "${{ github.repository }}" \
--name "image-digest-pr-${PR_NUMBER}" \
--dir /tmp/pr-digest \
2>/dev/null || true
if [ ! -f /tmp/pr-digest/image-digest.txt ]; then
echo "PR image-digest artifact not found — skipping comparison."
exit 0
fi
PR_DIGEST=$(cat /tmp/pr-digest/image-digest.txt)
DEPLOY_DIGEST="${{ steps.digest.outputs.digest }}"
echo "PR simulation digest: $PR_DIGEST"
echo "Deploy image digest: $DEPLOY_DIGEST"
if [ "$PR_DIGEST" = "$DEPLOY_DIGEST" ]; then
echo "✓ Digest match — bit-for-bit parity confirmed between PR and deploy."
else
echo "⚠ Digest mismatch — builds diverged between PR and deploy."
echo " Expected on squash-merges where the commit SHA changes."
echo " Ensure no source changes occurred between PR approval and deploy trigger."
fi
# Phase 2: Trivy scan — image pinned by immutable digest, NOT trivy-action.
# aquasec/trivy:0.49.1 → sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc
# Identical severity gates to pr.yml (HIGH,CRITICAL / exit-code 1).
# Two-phase: DB downloaded first (needs network), then scan runs --network none.
- name: Get date for Trivy DB cache key
id: trivy-date
run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
- name: Cache Trivy DB (daily refresh)
uses: actions/cache@v4
with:
path: /tmp/trivy-cache
key: trivy-db-${{ runner.os }}-${{ steps.trivy-date.outputs.date }}
restore-keys: |
trivy-db-${{ runner.os }}-
- name: Pull Trivy vulnerability database
run: |
docker run --rm \
-v /tmp/trivy-cache:/root/.cache \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc \
image --download-db-only
- name: Scan image with Trivy (HIGH/CRITICAL, ignore-unfixed)
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
run: |
SCAN_PASSED=false
for i in 1 2 3; do
if docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
-v "$(pwd)/.trivyignore:/tmp/.trivyignore:ro" \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--ignore-unfixed \
--severity HIGH,CRITICAL \
--exit-code 1 \
--ignorefile /tmp/.trivyignore \
"$IMAGE_NAME"; then
SCAN_PASSED=true
break
fi
echo "Trivy attempt $i failed..."
[ "$i" -lt 3 ] && sleep 5
done
if [ "$SCAN_PASSED" != "true" ]; then
echo "::error::Trivy scan failed after 3 attempts — HIGH/CRITICAL vulnerabilities found or scan error."
exit 1
fi
echo "✓ Trivy scan passed (HIGH/CRITICAL, ignore-unfixed)"
- name: Scan for unfixed CRITICAL vulnerabilities (informational)
continue-on-error: true
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
run: |
UNFIXED_COUNT=$(docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--severity CRITICAL \
--format json \
"$IMAGE_NAME" | jq '[.Results[]?.Misconfigurations[]? // .Results[]?.Vulnerabilities[]? | select(.FixedVersion == null or .FixedVersion == "")] | length')
if [ "$UNFIXED_COUNT" -gt 0 ]; then
echo "⚠ WARNING: $UNFIXED_COUNT unfixed CRITICAL vulnerabilities found"
echo " (No patches available upstream — waiting for vendor fix)"
docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--severity CRITICAL \
"$IMAGE_NAME" >> /tmp/unfixed-critical.log || true
else
echo "✓ No unfixed CRITICAL vulnerabilities"
fi
- name: Generate Trivy scan results (SARIF for GitHub Security)
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
run: |
docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
-v "$(pwd):/workspace" \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--format sarif \
--output /workspace/trivy-results.sarif \
"$IMAGE_NAME"
echo "✓ SARIF results written to trivy-results.sarif"
- name: Upload Trivy scan results to GitHub Security
uses: github/codeql-action/upload-sarif@v3
with:
sarif_file: trivy-results.sarif
category: 'trivy-image-scan'
# Phase 3: Scan passed — push the exact scanned image (same layer digests).
# Uses docker tag + push rather than rebuilding to guarantee what was scanned
# is exactly what lands in the registry.
- name: Verify image digest unchanged before push
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
run: |
# docker inspect .Id returns the config digest (sha256:...) which is
# stable across tag operations — same value captured in the digest step.
CURRENT=$(docker inspect "$IMAGE_NAME" --format='{{.Id}}')
echo "Expected digest : $IMAGE_DIGEST"
echo "Current digest : $CURRENT"
if [ "$CURRENT" != "$IMAGE_DIGEST" ]; then
echo "ERROR: image digest changed between scan and push — aborting."
exit 1
fi
echo "✓ Digest verified — pushing exactly what was scanned."
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Push verified image to registry
run: |
OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
docker tag \
fieldtrack-api:${{ steps.meta.outputs.sha_short }} \
ghcr.io/${OWNER}/api:${{ steps.meta.outputs.sha_short }}
docker push ghcr.io/${OWNER}/api:${{ steps.meta.outputs.sha_short }}
echo "✓ Pushed ghcr.io/${OWNER}/api:${{ steps.meta.outputs.sha_short }}"
# Use the same pinned Trivy image to generate the SBOM — no additional
# tool dependency, no unpinned action, same supply-chain guarantees.
- name: Generate SBOM (CycloneDX)
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
run: |
docker run --rm \
-v /var/run/docker.sock:/var/run/docker.sock \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--format cyclonedx \
--output /dev/stdout \
"$IMAGE_NAME" > sbom.json
- name: Upload SBOM artifact
uses: actions/upload-artifact@v4
with:
name: sbom-${{ steps.meta.outputs.sha_short }}
path: sbom.json
retention-days: 90
- name: Save build provenance
env:
IMAGE_NAME: fieldtrack-api:${{ steps.meta.outputs.sha_short }}
IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
run: |
echo "commit=${{ github.sha }}" > provenance.txt
echo "ref=${{ github.ref }}" >> provenance.txt
echo "image=${IMAGE_NAME}" >> provenance.txt
echo "digest=${IMAGE_DIGEST}" >> provenance.txt
echo "workflow=${{ github.workflow }}" >> provenance.txt
echo "run_id=${{ github.run_id }}" >> provenance.txt
- name: Upload provenance artifact
uses: actions/upload-artifact@v4
with:
name: provenance-${{ steps.meta.outputs.sha_short }}
path: provenance.txt
retention-days: 90
- name: Build & scan summary
if: always()
env:
IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
run: |
SBOM_COUNT=$(python3 -c "import json; d=json.load(open('sbom.json')); print(len(d.get('components', [])))" 2>/dev/null || echo 'n/a')
{
echo "## Build · Scan · Push"
echo "| Field | Value |"
echo "|---|---|"
echo "| Commit SHA | \`${{ github.sha }}\` |"
echo "| Image tag | \`fieldtrack-api:${{ steps.meta.outputs.sha_short }}\` |"
echo "| Image digest | \`${IMAGE_DIGEST}\` |"
echo "| SBOM components | ${SBOM_COUNT} |"
echo "| Trivy gate | HIGH,CRITICAL / exit-code 1 / ignore-unfixed |"
echo "| Registry | ghcr.io/${{ github.repository_owner }}/api |"
} >> "$GITHUB_STEP_SUMMARY"
# ---------------------------------------------------------------------------
# JOB: vps-readiness-check
#
# Validates the VPS is in a deployable state BEFORE running the deploy.
# Runs in PARALLEL with vps-readiness-check (both depend on build-scan-push).
# Both must succeed before deploy is allowed to proceed.
#
# Delegates to scripts/vps-readiness-check.sh which checks:
# - Docker daemon running
# - api_network exists (auto-created if missing)
# - Ports 80/443 free from non-nginx processes
# - No API containers with host port bindings
# - Required .env file present
# - Runtime directories present (auto-created if missing)
# - Sufficient disk space (auto-prunes if borderline)
# ---------------------------------------------------------------------------
vps-readiness-check:
name: VPS Readiness Gate
runs-on: ubuntu-latest
needs: [build-scan-push]
timeout-minutes: 10
steps:
- name: Run VPS readiness check via SSH
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT — run vps-setup.sh first"; exit 1; }
cd "$DEPLOY_ROOT"
# Pull latest scripts without full deploy
git fetch origin master --depth=1
git checkout origin/master -- scripts/vps-readiness-check.sh 2>/dev/null || true
chmod +x scripts/vps-readiness-check.sh
./scripts/vps-readiness-check.sh
# ---------------------------------------------------------------------------
# JOB: deploy
#
# Blue-Green deployment to VPS via SSH.
# The deploy-bluegreen.sh script manages slot switching and container health.
#
# DEPENDENCY GATES (both must pass):
# - vps-readiness-check: ensures VPS can accept the deployment
# ---------------------------------------------------------------------------
deploy:
name: Deploy (Blue-Green SSH)
runs-on: ubuntu-latest
needs: [build-scan-push, vps-readiness-check]
timeout-minutes: 20
steps:
- name: Validate required deployment secrets
env:
API_BASE_URL: ${{ secrets.API_BASE_URL }}
CORS_ORIGIN: ${{ secrets.CORS_ORIGIN }}
run: |
if [ -z "${API_BASE_URL:-}" ]; then
echo "::error::API_BASE_URL secret is not set. Deployment aborted."
exit 1
fi
echo "✓ API_BASE_URL is set"
if [ -z "${CORS_ORIGIN:-}" ]; then
echo "::error::CORS_ORIGIN secret is not set. Deployment aborted."
exit 1
fi
echo "✓ CORS_ORIGIN is set"
- name: Log deployment metadata and trigger info
run: |
{
echo "## Deployment Initiated"
echo "| Field | Value |"
echo "|---|---|"
echo "| Commit SHA | \`${{ github.sha }}\` |"
echo "| Trigger event | ${{ github.event_name }} |"
echo "| Triggered by | ${{ github.actor }} |"
echo "| Branch | ${{ github.ref_name }} |"
echo "| Workflow run | [${{ github.run_id }}](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}) |"
echo "| Commit message | \`${{ github.event.head_commit.message }}\` |"
} >> "$GITHUB_STEP_SUMMARY"
echo "📋 Deployment initiated — SHA=${{ github.sha }} EVENT=${{ github.event_name }} ACTOR=${{ github.actor }} RUN=${{ github.run_id }}"
- name: Validate environment contract before deploy
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
echo "USER=$(whoami)"
echo "HOME=$HOME"
echo "PWD=$(pwd)"
ls -la "$HOME"
ls -la "$HOME/api"
[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
# Pin repo to the exact SHA that was built and scanned by CodeQL.
# Prevents stale scripts from running if concurrent commits landed.
git fetch origin
git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }}
chmod +x scripts/*.sh
echo "=== Pre-deploy environment validation ==="
./scripts/validate-env.sh --check-monitoring
echo "✓ Environment contract validated"
- name: Blue-Green deploy via SSH
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
T0=$(date +%s)
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
echo "USER=$(whoami)"
echo "HOME=$HOME"
echo "PWD=$(pwd)"
ls -la "$HOME"
ls -la "$HOME/api"
ls -la "$DEPLOY_ROOT"
[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
# Enforce repo is at the exact SHA being deployed (issue 7 — prevents
# stale deploy scripts if another commit landed during this pipeline run).
git fetch origin
git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }}
chmod +x scripts/*.sh
# Environment already validated in previous step
./scripts/deploy-bluegreen.sh "${{ needs.build-scan-push.outputs.sha_short }}"
echo "✓ Deploy completed in $(($(date +%s) - T0))s"
- name: Log deployment state (slot + SHA for debugging)
uses: appleboy/ssh-action@v1.0.3
if: always()
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "unknown")
ACTIVE_CONTAINER="api-${ACTIVE_SLOT}"
DEPLOY_STATUS="UNKNOWN"
# Health check via in-network curl container — exercises Docker DNS
# and bridge routing (same path nginx uses). NO host port binding needed.
FT_CURL_IMG="curlimages/curl:8.7.1"
if docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1; then
if docker run --rm --network api_network "$FT_CURL_IMG" \
-sf --max-time 5 "http://$ACTIVE_CONTAINER:3000/health" >/dev/null 2>&1; then
DEPLOY_STATUS="SUCCESS"
else
DEPLOY_STATUS="UNHEALTHY"
fi
else
DEPLOY_STATUS="CONTAINER_MISSING"
fi
echo "DEPLOY_STATE=$DEPLOY_STATUS | SLOT=$ACTIVE_SLOT | CONTAINER=$ACTIVE_CONTAINER | SHA=${{ github.sha }}"
# ---------------------------------------------------------------------------
# JOB: api-health-gate (Step E+)
#
# Early API health validation — runs AFTER deploy but BEFORE infra sync.
# Ensures the API container is truly healthy before we sync monitoring/nginx.
# If the API is not healthy at this point, STOP before touching infra.
# ---------------------------------------------------------------------------
api-health-gate:
name: API Health Gate
runs-on: ubuntu-latest
needs: [deploy]
timeout-minutes: 5
steps:
- name: Verify API container is healthy before infra sync
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
source scripts/load-env.sh
# Determine active slot (blue/green)
ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "blue")
ACTIVE_CONTAINER="api-$ACTIVE_SLOT"
echo "=== API Health Gate (slot: $ACTIVE_SLOT, container: $ACTIVE_CONTAINER) ==="
# Guard: container must exist before we try to reach it
docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1 || {
echo "❌ Container $ACTIVE_CONTAINER not found"
exit 1
}
# Poll /ready via in-network curl (Docker DNS + bridge routing).
# /ready checks Redis, Supabase, and BullMQ — definitive readiness gate.
# Uses docker run rather than docker exec so the check exercises the
# same network path nginx uses, not the container's own loopback.
FT_CURL_IMG="curlimages/curl:8.7.1"
for i in $(seq 1 15); do
STATUS=$(docker run --rm --network api_network "$FT_CURL_IMG" \
-s -o /dev/null -w "%{http_code}" \
"http://$ACTIVE_CONTAINER:3000/ready" 2>/dev/null || echo "000")
if [ "$STATUS" = "200" ]; then
echo "✓ API ready (container $ACTIVE_CONTAINER, attempt $i)"
exit 0
fi
echo " Attempt $i: HTTP $STATUS — waiting..."
sleep 2
done
echo "❌ API /ready did not return 200 after 30s — monitoring sync would fail anyway"
docker logs "$ACTIVE_CONTAINER" --tail 30 2>/dev/null || true
exit 1
# ---------------------------------------------------------------------------
# JOB: sync-infra
#
# Syncs Nginx config (with slot-aware port substitution).
# Monitoring restarts are handled exclusively by deploy-bluegreen.sh.
# ---------------------------------------------------------------------------
sync-infra:
name: Sync Infrastructure (nginx)
runs-on: ubuntu-latest
needs: [api-health-gate]
timeout-minutes: 10
steps:
- name: Sync infrastructure configs via SSH
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
T0=$(date +%s)
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
echo "USER=$(whoami)"
echo "HOME=$HOME"
echo "PWD=$(pwd)"
ls -la "$HOME"
ls -la "$HOME/api"
[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
INFRA_DIR="$DEPLOY_ROOT/infra"
NGINX_LIVE="$DEPLOY_ROOT/infra/nginx/live/api.conf"
NGINX_BACKUP_DIR="$DEPLOY_ROOT/infra/nginx/backup"
ACTIVE_SLOT_FILE="/var/run/api/active-slot"
ACTIVE_SLOT=$(cat "$ACTIVE_SLOT_FILE" 2>/dev/null || echo "blue")
ACTIVE_CONTAINER="api-$ACTIVE_SLOT"
# Load env from .env — exports DEPLOY_ROOT, API_HOSTNAME, and all
# app variables. DEPLOY_ROOT is already exported above; load-env.sh uses it.
source "$DEPLOY_ROOT/scripts/load-env.sh"
echo "✓ API_HOSTNAME: $API_HOSTNAME"
# Ensure live/backup dirs exist
mkdir -p "$(dirname "$NGINX_LIVE")" "$NGINX_BACKUP_DIR"
echo "=== Syncing Nginx (slot: $ACTIVE_SLOT, container: $ACTIVE_CONTAINER) ==="
cp "$NGINX_LIVE" "$NGINX_BACKUP_DIR/api.conf.bak.$(date +%s)" 2>/dev/null || true
NGINX_TMP=$(mktemp /tmp/fieldtrack-nginx.XXXXXX.conf)
sed \
-e "s|__ACTIVE_CONTAINER__|$ACTIVE_CONTAINER|g" \
-e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \
"$INFRA_DIR/nginx/api.conf" > "$NGINX_TMP"
cp "$NGINX_TMP" "$NGINX_LIVE"
rm -f "$NGINX_TMP"
if ! docker exec nginx nginx -t 2>&1; then
echo "Nginx test failed — restoring backup..."
LATEST_BAK=$(ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | head -1 || true)
[ -n "$LATEST_BAK" ] && cp "$LATEST_BAK" "$NGINX_LIVE"
exit 1
fi
docker exec nginx nginx -s reload
echo "✓ Nginx reloaded."
# ROUTING VALIDATION — Test actual traffic through Nginx
# Phase 1 (source of truth): in-network docker run inside api_network.
# Phase 2 (advisory): HTTPS via localhost + Host header; --insecure handles
# Cloudflare origin cert. status=000 = host→Docker TCP routing issue, not TLS.
echo "=== Testing Nginx routing (in-network primary, HTTPS advisory) ==="
sleep 2 # Give Nginx a moment to fully apply reload
ROUTE_STATUS=$(docker run --rm --network api_network \
curlimages/curl:8.7.1 -s -o /dev/null -w "%{http_code}" \
--max-time 10 http://nginx/health 2>/dev/null || echo "000")
if [ "$ROUTE_STATUS" = "200" ]; then
echo "✓ Nginx routing verified via in-network check (HTTP $ROUTE_STATUS)"
else
echo "❌ Nginx in-network routing broken (HTTP $ROUTE_STATUS expected 200) — restoring backup..."
LATEST_BAK=$(ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | head -1 || true)
[ -n "$LATEST_BAK" ] && cp "$LATEST_BAK" "$NGINX_LIVE"
docker exec nginx nginx -t 2>&1 && docker exec nginx nginx -s reload || true
exit 1
fi
# HTTPS advisory check (non-blocking — host→Docker loopback may fail with status=000)
HTTPS_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
--resolve "$API_HOSTNAME:443:127.0.0.1" \
-H "Host: $API_HOSTNAME" \
"https://127.0.0.1/health" --insecure 2>/dev/null || echo "000")
if [ "$HTTPS_STATUS" = "200" ]; then
echo "✓ HTTPS advisory check passed (HTTP $HTTPS_STATUS)"
else
echo "⚠ HTTPS advisory status=$HTTPS_STATUS (host→Docker TCP routing; in-network check is authoritative)"
fi
echo "✓ Infra sync completed in $(($(date +%s) - T0))s"
# ---------------------------------------------------------------------------
# JOB: sync-monitoring (Step F)
#
# Idempotent monitoring stack sync — runs after every deploy.
# Delegates to scripts/monitoring-sync.sh which:
# - Self-heals missing .env.monitoring from example
# - Creates api_network if absent
# - Renders alertmanager.rendered.yml
# - Runs docker compose up -d
# - Validates prometheus / alertmanager / grafana health
# Monitoring is REQUIRED — deploy fails if any required container is unhealthy.
# ---------------------------------------------------------------------------
sync-monitoring:
name: Sync Monitoring Stack
runs-on: ubuntu-latest
needs: [sync-infra]
timeout-minutes: 15
steps:
- name: Sync and validate monitoring stack via SSH
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
chmod +x scripts/monitoring-sync.sh
./scripts/monitoring-sync.sh
- name: Monitoring sync summary
if: always()
run: |
{
echo "## Monitoring Sync"
echo "| Container | Required |"
echo "|---|---|"
echo "| prometheus | ✅ |"
echo "| alertmanager | ✅ |"
echo "| grafana | ✅ |"
} >> "$GITHUB_STEP_SUMMARY"
- name: Deployment artifact traceability
if: always()
run: |
{
echo "## Deployment Artifacts"
echo "| Field | Value |"
echo "|---|---|"
echo "| Deployment SHA | \`${{ github.sha }}\` |"
echo "| Image Tag | \`fieldtrack-api:${{ needs.get-metadata.outputs.sha_short || github.sha }}\` |"
echo "| Workflow Run | [\#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) |"
echo "| Triggered By | \`${{ github.event_name }}\` |"
echo "| Commit Message | \`${{ github.event.head_commit.message }}\` |"
} >> "$GITHUB_STEP_SUMMARY"
# Also output to logs for audit trail
echo "DEPLOYMENT_COMPLETE: SHA=${{ github.sha }} IMAGE=ghcr.io/${{ github.repository_owner }}/api:${{ github.sha }} RUN=${{ github.run_id }}"
  # ---------------------------------------------------------------------------
  # JOB: health-and-smoke
  #
  # Step 1: Poll /health until it reports ok — the in-network Docker check is
  #         authoritative; the HTTPS check is advisory only. Up to 30 attempts
  #         (~60 s), then a stabilization re-check of up to 10 attempts (~20 s).
  # Step 2: Run the full smoke test suite (login + core API flows).
  # Failure here triggers the rollback job automatically.
  # ---------------------------------------------------------------------------
health-and-smoke:
name: Health Checks & Smoke Tests
runs-on: ubuntu-latest
needs: [sync-infra, sync-monitoring]
timeout-minutes: 15
steps:
- name: Checkout
uses: actions/checkout@v5
- name: Wait for /health endpoint (via VPS)
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
source scripts/load-env.sh
echo "=== Checking /health via VPS (API_HOSTNAME=$API_HOSTNAME) ==="
for i in $(seq 1 30); do
echo "---- Attempt $i ----"
# Phase 1: in-network (source of truth)
INNET_BODY=$(docker run --rm --network api_network \
curlimages/curl:8.7.1 -s --max-time 5 http://nginx/health 2>/dev/null || echo "")
if echo "$INNET_BODY" | grep -q '"status":"ok"'; then
echo "✓ /health OK via in-network (attempt $i)"
exit 0
fi
# Phase 2: HTTPS advisory (--insecure for Cloudflare origin cert; status=000 = host→Docker TCP issue)
STATUS=$(curl -sS \
--resolve "${API_HOSTNAME}:443:127.0.0.1" \
-o /tmp/resp.txt \
-w "%{http_code}" \
https://${API_HOSTNAME}/health \
--insecure 2>/dev/null || echo "000")
BODY=$(cat /tmp/resp.txt 2>/dev/null || echo "")
echo "HTTP: $STATUS BODY: $BODY"
if [ "$STATUS" = "200" ] && echo "$BODY" | grep -q '"status":"ok"'; then
echo "✓ /health OK via HTTPS (attempt $i)"
exit 0
fi
[ "$STATUS" = "000" ] && echo "⚠ HTTPS status=000 (host→Docker routing; in-network is authoritative)"
sleep 2
done
echo "❌ /health failed after 30 attempts"
exit 1
- name: Wait for /health endpoint (final public check)
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
source scripts/load-env.sh
echo "=== Final health check via public endpoint (API_HOSTNAME=$API_HOSTNAME) ==="
for i in $(seq 1 10); do
echo "---- Attempt $i ----"
# Phase 1: in-network (source of truth)
INNET_BODY=$(docker run --rm --network api_network \
curlimages/curl:8.7.1 -s --max-time 5 http://nginx/health 2>/dev/null || echo "")
if echo "$INNET_BODY" | grep -q '"status":"ok"'; then
echo "✓ /health OK via in-network (attempt $i)"
exit 0
fi
# Phase 2: HTTPS advisory (--insecure for Cloudflare origin cert; status=000 = host→Docker TCP issue)
STATUS=$(curl -sS \
--resolve "${API_HOSTNAME}:443:127.0.0.1" \
-o /tmp/resp.txt \
-w "%{http_code}" \
https://${API_HOSTNAME}/health \
--insecure 2>/dev/null || echo "000")
BODY=$(cat /tmp/resp.txt 2>/dev/null || echo "")
echo "HTTP: $STATUS BODY: $BODY"
if [ "$STATUS" = "200" ] && echo "$BODY" | grep -q '"status":"ok"'; then
echo "✓ /health OK via HTTPS (attempt $i)"
exit 0
fi
[ "$STATUS" = "000" ] && echo "⚠ HTTPS status=000 (host→Docker routing; in-network is authoritative)"
sleep 2
done
echo "❌ /health failed after 10 attempts"
exit 1
- name: Run smoke tests
env:
API_BASE_URL: ${{ secrets.API_BASE_URL }}
FT_EMP_EMAIL: ${{ secrets.FT_EMP_EMAIL }}
FT_EMP_PASSWORD: ${{ secrets.FT_EMP_PASSWORD }}
FT_ADMIN_EMAIL: ${{ secrets.FT_ADMIN_EMAIL }}
FT_ADMIN_PASSWORD: ${{ secrets.FT_ADMIN_PASSWORD }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY }}
run: |
chmod +x scripts/smoke-test.sh
./scripts/smoke-test.sh
- name: Upload smoke test report
if: always()
uses: actions/upload-artifact@v4
with:
name: smoke-test-report-${{ github.sha }}
path: smoke-report.json
retention-days: 30
- name: Deployment summary
run: |
echo "====================================================="
echo " Production Deployment: COMPLETE ✅"
echo "====================================================="
echo " Commit: ${{ github.sha }}"
echo " /health: OK"
echo " /ready: OK"
echo " Smoke: passed"
echo "====================================================="
  # ---------------------------------------------------------------------------
  # JOB: rollback
  #
  # Triggered automatically when ANY job in the deploy chain fails:
  # vps-readiness-check, deploy, api-health-gate, sync-infra, sync-monitoring,
  # or health-and-smoke.
  # Restores the previously healthy Blue-Green slot via the rollback script.
  # 'if: always()' ensures this job can evaluate even if upstream jobs failed.
  # ---------------------------------------------------------------------------
rollback:
name: Rollback Deployment (auto)
runs-on: ubuntu-latest
needs: [vps-readiness-check, deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke]
timeout-minutes: 10
if: |
always() &&
(
needs.vps-readiness-check.result == 'failure' ||
needs.deploy.result == 'failure' ||
needs.api-health-gate.result == 'failure' ||
needs.sync-infra.result == 'failure' ||
needs.sync-monitoring.result == 'failure' ||
needs.health-and-smoke.result == 'failure'
)
steps:
- name: Log rollback trigger
run: |
echo "ROLLBACK_TRIGGERED=TRUE | FAILED_JOBS:"
[ "${{ needs.vps-readiness-check.result }}" = "failure" ] && echo " - vps-readiness-check"
[ "${{ needs.deploy.result }}" = "failure" ] && echo " - deploy"
[ "${{ needs.api-health-gate.result }}" = "failure" ] && echo " - api-health-gate"
[ "${{ needs.sync-infra.result }}" = "failure" ] && echo " - sync-infra"
[ "${{ needs.sync-monitoring.result }}" = "failure" ] && echo " - sync-monitoring"
[ "${{ needs.health-and-smoke.result }}" = "failure" ] && echo " - health-and-smoke"
echo "SHA=${{ github.sha }}"
- name: Rollback on VPS
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
echo "USER=$(whoami)"
echo "HOME=$HOME"
echo "PWD=$(pwd)"
ls -la "$HOME"
ls -la "$HOME/api"
[ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
cd "$DEPLOY_ROOT"
chmod +x scripts/*.sh
./scripts/rollback.sh --auto
# Log final state
ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "unknown")
echo "ROLLBACK_COMPLETE | ACTIVE_SLOT=$ACTIVE_SLOT | SHA=${{ github.sha }}"