diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 7b2cdeb..464f3e9 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -241,8 +241,10 @@ jobs: # i.e., direct references to alertmanager, loki push clients, or # docker-compose.monitoring in the application source. # Exclude comment-only lines (-h suppresses filenames for grep -Ev). + # With bash -o pipefail (GHA default), grep exits 1 when there are zero matches; + # that must not fail the step — only non-empty LEAKS after filtering is an error. LEAKS=$(grep -rhE "(alertmanager|docker-compose\.monitoring)" src/ tests/ 2>/dev/null \ - | grep -Ev '^\s*(//|#|\*|/\*)') + | grep -Ev '^\s*(//|#|\*|/\*)' || true) if [ -n "$LEAKS" ]; then echo "::error::Infra client references found in src/ or tests/" echo "$LEAKS" @@ -278,7 +280,7 @@ jobs: # Guard 1: no stale network name (fieldtrack_network is not the canonical name) if grep -rE '\bfieldtrack_network\b' src/ scripts/ \ --include='*.ts' --include='*.sh' \ - 2>/dev/null | grep -Ev '^\s*#'; then + 2>/dev/null | grep -Ev '^[^:]+:\s*(#|//)'; then echo "::error::Forbidden network name 'fieldtrack_network' found — canonical name is 'api_network'" FAIL=1 fi @@ -791,7 +793,8 @@ jobs: username: ${{ secrets.DO_USER }} key: ${{ secrets.DO_SSH_KEY }} script: | - ACTIVE_SLOT=$(cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo "unknown") + export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" + ACTIVE_SLOT=$(cat "$DEPLOY_ROOT/.fieldtrack/active-slot" 2>/dev/null || cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo "unknown") ACTIVE_CONTAINER="api-${ACTIVE_SLOT}" DEPLOY_STATUS="UNKNOWN" @@ -836,7 +839,7 @@ jobs: [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" - ACTIVE_SLOT=$(cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo "blue") + ACTIVE_SLOT=$(cat "$DEPLOY_ROOT/.fieldtrack/active-slot" 2>/dev/null || cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo 
"blue") ACTIVE_CONTAINER="api-$ACTIVE_SLOT" docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1 || { @@ -896,7 +899,7 @@ jobs: # Scope: scripts/ and src/ only (not workflows where guard steps live). if grep -rE "\./infra/|\.\.\./infra/" scripts/ src/ \ --binary-files=without-match --exclude-dir=node_modules 2>/dev/null \ - | grep -Ev '^\s*#'; then + | grep -Ev '^[^:]+:\s*(#|//)'; then echo "::error::Local repo-relative infra coupling (./infra/ or ../infra/) detected in scripts/ or src/" exit 1 fi @@ -913,7 +916,8 @@ jobs: export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" - API_BASE_URL=$(grep -E '^API_BASE_URL=' .env | head -1 | cut -d'=' -f2-) + API_BASE_URL=$(grep -E '^API_BASE_URL=' .env 2>/dev/null | head -1 | cut -d'=' -f2- || true) + [ -n "$API_BASE_URL" ] || { echo "::error::API_BASE_URL missing or empty in .env"; exit 1; } API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1) for i in $(seq 1 30); do # Phase 1: in-network (source of truth) @@ -949,7 +953,8 @@ jobs: export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" [ -d "$DEPLOY_ROOT" ] || { echo "::error::DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; } cd "$DEPLOY_ROOT" - API_BASE_URL=$(grep -E '^API_BASE_URL=' .env | head -1 | cut -d'=' -f2-) + API_BASE_URL=$(grep -E '^API_BASE_URL=' .env 2>/dev/null | head -1 | cut -d'=' -f2- || true) + [ -n "$API_BASE_URL" ] || { echo "::error::API_BASE_URL missing or empty in .env"; exit 1; } API_HOSTNAME=$(echo "$API_BASE_URL" | sed -E 's|^https?://||' | cut -d'/' -f1) for i in $(seq 1 10); do # Phase 1: in-network (source of truth) @@ -986,7 +991,8 @@ jobs: username: ${{ secrets.DO_USER }} key: ${{ secrets.DO_SSH_KEY }} script: | - ACTIVE_SLOT=$(cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo "unknown") + export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" + ACTIVE_SLOT=$(cat "$DEPLOY_ROOT/.fieldtrack/active-slot" 2>/dev/null || cat 
/var/lib/fieldtrack/active-slot 2>/dev/null || echo "unknown") ACTIVE_CONTAINER="api-${ACTIVE_SLOT}" FT_CURL_IMG="curlimages/curl:8.7.1" READY_RESP=$(docker run --rm --network api_network "$FT_CURL_IMG" \ @@ -1061,5 +1067,5 @@ jobs: chmod +x scripts/*.sh ./scripts/deploy.sh --rollback --auto - ACTIVE_SLOT=$(cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo "unknown") + ACTIVE_SLOT=$(cat "$DEPLOY_ROOT/.fieldtrack/active-slot" 2>/dev/null || cat /var/lib/fieldtrack/active-slot 2>/dev/null || echo "unknown") echo "[DEPLOY] Rollback complete — slot=$ACTIVE_SLOT sha=${{ github.sha }}" diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 53f667a..66ed18e 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -123,7 +123,7 @@ jobs: if grep -rE '\bfieldtrack_network\b' src/ scripts/ \ --include='*.ts' --include='*.sh' \ - 2>/dev/null | grep -Ev '^\s*#'; then + 2>/dev/null | grep -Ev '^[^:]+:\s*(#|//)'; then echo "::error::Forbidden network name 'fieldtrack_network' — canonical name is 'api_network'" FAIL=1 fi diff --git a/.gitignore b/.gitignore index 11fb191..9849c2c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ # Deployment history (VPS-side file, never committed) .deploy_history .last_deploy +# Fallback deploy state when /var/lib/fieldtrack is not writable (slot, lock) +.fieldtrack/ # ---------------- # Dependencies diff --git a/scripts/deploy.sh b/scripts/deploy.sh index 4ad1d62..58ffa5b 100644 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -29,6 +29,10 @@ # - NEVER depends on: Redis, Supabase, BullMQ, monitoring stack # - No /ready usage anywhere in this script # - All nginx reloads flow through switch_nginx() — exactly once per deploy +# +# Deploy state (slot, lock, last-good): +# - FIELDTRACK_STATE_DIR or /var/lib/fieldtrack when writable (sudo chown if needed) +# - Otherwise $DEPLOY_ROOT/.fieldtrack; existing /var/lib/fieldtrack/* is migrated once # 
# ---------------------------------------------------------------------------
# SLOT DIRECTORY AND FILE MANAGEMENT
#
# Primary state dir: FIELDTRACK_STATE_DIR or /var/lib/fieldtrack (persistent).
# If that path is missing or not writable by the deploy user, we first try to
# create it unprivileged, then with sudo, and finally sudo chown it. If it is
# still not writable, fall back to $DEPLOY_ROOT/.fieldtrack and migrate the
# legacy slot files once.
# ---------------------------------------------------------------------------

#######################################
# Ensure a state directory exists and is writable by the current user.
# Tries an unprivileged mkdir first so user-owned locations never need sudo;
# sudo is only used when the plain attempt fails or the dir is not writable.
# Arguments: $1 - directory path
# Returns:   0 if the directory is writable afterwards, 1 otherwise.
#######################################
_ft_make_state_dir_writable() {
  local d="${1:-}"
  [ -n "$d" ] || return 1

  if [ ! -d "$d" ]; then
    mkdir -p "$d" 2>/dev/null \
      || sudo mkdir -p "$d" 2>/dev/null \
      || return 1
  fi

  if [ -w "$d" ]; then
    return 0
  fi

  # Directory exists but belongs to someone else: hand it to the deploy user.
  sudo chown "$(id -un):$(id -gn)" "$d" 2>/dev/null || return 1
  sudo chmod u+rwx "$d" 2>/dev/null || true
  [ -w "$d" ]
}

#######################################
# Copy slot state from the legacy /var/lib/fieldtrack location into the
# active state dir. Does nothing when the active dir IS the legacy dir, when
# an active-slot file already exists, or when no legacy active-slot is readable.
# Globals:   SLOT_DIR, ACTIVE_SLOT_FILE, SLOT_BACKUP_FILE, LAST_GOOD_FILE (read)
# Returns:   always 0 (migration is best-effort)
#######################################
_ft_migrate_state_from_var_lib_if_needed() {
  local legacy="/var/lib/fieldtrack"

  if [ "$SLOT_DIR" = "$legacy" ]; then
    return 0
  fi
  if [ -f "$ACTIVE_SLOT_FILE" ]; then
    return 0
  fi
  if [ ! -r "$legacy/active-slot" ]; then
    return 0
  fi

  _ft_log "msg='migrating active-slot from legacy path' from=$legacy/active-slot"
  if ! cp -a "$legacy/active-slot" "$ACTIVE_SLOT_FILE" 2>/dev/null; then
    # Best-effort: keep going, but leave a trace instead of failing silently.
    _ft_log "level=WARN msg='legacy active-slot copy failed' from=$legacy/active-slot to=$ACTIVE_SLOT_FILE"
  fi

  if [ -f "$legacy/active-slot.backup" ] && [ ! -f "$SLOT_BACKUP_FILE" ]; then
    cp -a "$legacy/active-slot.backup" "$SLOT_BACKUP_FILE" 2>/dev/null || true
  fi
  if [ -f "$legacy/last-good" ] && [ ! -f "$LAST_GOOD_FILE" ]; then
    cp -a "$legacy/last-good" "$LAST_GOOD_FILE" 2>/dev/null || true
  fi
  return 0
}

#######################################
# Choose the state directory and derive every state-file path from it.
# Globals:   FIELDTRACK_STATE_DIR, DEPLOY_ROOT (read)
#            SLOT_DIR, ACTIVE_SLOT_FILE, SLOT_BACKUP_FILE, LOCK_FILE,
#            SNAP_DIR, LAST_GOOD_FILE (written)
# Returns:   0 on success, 1 if neither the preferred dir nor the
#            $DEPLOY_ROOT/.fieldtrack fallback can be used.
#######################################
_ft_init_fieldtrack_state() {
  local preferred="${FIELDTRACK_STATE_DIR:-/var/lib/fieldtrack}"

  SLOT_DIR="$preferred"
  if ! _ft_make_state_dir_writable "$SLOT_DIR"; then
    SLOT_DIR="$DEPLOY_ROOT/.fieldtrack"
    if ! mkdir -p "$SLOT_DIR"; then
      _ft_log "level=ERROR msg='cannot create fallback state dir' path=$SLOT_DIR user=$(id -un)"
      return 1
    fi
    _ft_log "msg='preferred state dir not writable; using DEPLOY_ROOT fallback' preferred=$preferred fallback=$SLOT_DIR user=$(id -un)"
  fi

  ACTIVE_SLOT_FILE="$SLOT_DIR/active-slot"
  SLOT_BACKUP_FILE="$SLOT_DIR/active-slot.backup"
  LOCK_FILE="$SLOT_DIR/deploy.lock"
  SNAP_DIR="$SLOT_DIR"
  LAST_GOOD_FILE="$SNAP_DIR/last-good"

  _ft_migrate_state_from_var_lib_if_needed
}

#######################################
# Verify (and if needed re-create) the slot directory before it is used.
# Shares the create/repair logic with _ft_init_fieldtrack_state so that sudo
# is only invoked when an unprivileged mkdir is not enough.
# Globals:   SLOT_DIR (read)
# Returns:   0 if SLOT_DIR is writable, 1 otherwise (after logging an error).
#######################################
_ft_ensure_slot_dir() {
  if ! _ft_make_state_dir_writable "$SLOT_DIR"; then
    _ft_log "level=ERROR msg='slot directory not writable' path=$SLOT_DIR user=$(id -un)"
    return 1
  fi
  return 0
}
flock -n 200; then @@ -469,7 +525,7 @@ pull_image() { # --------------------------------------------------------------------------- resolve_slot() { _ft_state "RESOLVE_SLOT" "msg='determining active slot'" - _ft_ensure_slot_dir + _ft_ensure_slot_dir || _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=slot_dir_not_writable" local recovered_slot="" @@ -1213,15 +1269,15 @@ DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}" REPO_DIR="$DEPLOY_ROOT" INFRA_ROOT="${INFRA_ROOT:-/opt/infra}" +_ft_init_fieldtrack_state + BLUE_NAME="api-blue" GREEN_NAME="api-green" APP_PORT=3000 NETWORK="api_network" _FT_CURL_IMG="curlimages/curl:8.7.1" -SLOT_DIR="/var/lib/fieldtrack" -ACTIVE_SLOT_FILE="$SLOT_DIR/active-slot" -SLOT_BACKUP_FILE="/var/lib/fieldtrack/active-slot.backup" # persists; same dir as primary +# SLOT_DIR, ACTIVE_SLOT_FILE, LOCK_FILE, etc. — set by _ft_init_fieldtrack_state() NGINX_CONF="$INFRA_ROOT/nginx/live/api.conf" NGINX_LIVE_DIR="$INFRA_ROOT/nginx/live" @@ -1233,10 +1289,6 @@ MAX_HISTORY=5 MAX_HEALTH_ATTEMPTS=40 HEALTH_INTERVAL=3 -LOCK_FILE="$SLOT_DIR/deploy.lock" -SNAP_DIR="$SLOT_DIR" -LAST_GOOD_FILE="$SNAP_DIR/last-good" - # DEPLOY_HISTORY is set inside preflight() after _ft_load_env() DEPLOY_HISTORY="" diff --git a/scripts/vps-readiness-check.sh b/scripts/vps-readiness-check.sh index ade5d32..e24aec9 100644 --- a/scripts/vps-readiness-check.sh +++ b/scripts/vps-readiness-check.sh @@ -13,7 +13,7 @@ # # HARD FAILURES (exit 1): # - Docker daemon not running -# - Ports 80 or 443 occupied by ANY non-docker-proxy, non-nginx process +# - Ports 80 or 443 occupied by processes other than docker-proxy / nginx # - Any container has host port bindings (violates production architecture) # - Required containers not attached to api_network # - Required .env file missing @@ -87,40 +87,34 @@ if ! docker network ls --format '{{.Name}}' | grep -Eq "^${NETWORK}$"; then else ok "Network '$NETWORK' exists." 
fi -# ── CHECK 4: Ports 80 and 443 — no non-docker processes ────────────────────── +# ── CHECK 4: Ports 80 and 443 — expected listeners only ───────────────────── # -# Design: we do NOT auto-kill unknown processes. If port 80 or 443 is held by -# a non-docker process (e.g., system nginx, apache, lighttpd), that is a VPS -# configuration error that requires operator action. Silently killing unknown -# processes risks breaking the system in unpredictable ways. +# Design: we do NOT auto-kill unknown processes. Published container ports show +# up as docker-proxy (full name in `ss -tlnp`; lsof often truncates COMMAND to +# 8 chars e.g. "docker-pr", which broke older allowlists). # -# Allowed occupants (hard-coded safe list): -# - docker-proxy (managed by Docker / our nginx container) -# - nginx (running as Docker container — docker exec nginx) +# Use ss (same as elsewhere in this script) and allow: +# - docker-proxy / docker-pr (truncated) — Docker publishing nginx :80/:443 +# - nginx — system or container worker name in ss output # -# Everything else → hard fail with diagnostics. +# Everything else on these ports → hard fail (e.g. apache bind-mount). echo "" -echo "--- CHECK 4: Port 80/443 — no non-docker processes ---" +echo "--- CHECK 4: Port 80/443 — docker-proxy / nginx only ---" _check_port() { local port="$1" + local listeners + listeners=$(sudo ss -tlnp "sport = :${port}" 2>/dev/null || ss -tlnp "sport = :${port}" 2>/dev/null || true) - # Check if anything is listening on the port at all - if ! ss -tlnp "sport = :${port}" 2>/dev/null | grep -q 'LISTEN'; then + if ! echo "$listeners" | grep -q 'LISTEN'; then ok "Port $port is free." return 0 fi - # Check for non-docker-proxy, non-nginx processes via lsof - # lsof -i :PORT lists ALL processes holding the port. - # We exclude docker-proxy and nginx (expected Docker-managed processes). 
- NON_DOCKER=$(sudo lsof -i ":${port}" -sTCP:LISTEN -P -n 2>/dev/null \ - | awk 'NR>1 {print $1, $2}' \ - | grep -vE '^(docker-pro|nginx)' || true) - - if [ -n "$NON_DOCKER" ]; then - record_failure "Port $port is occupied by a non-docker process." - echo " Offending process(es):" - sudo lsof -i ":${port}" -sTCP:LISTEN -P -n 2>/dev/null | awk 'NR>1' | sed 's/^/ /' + # Any LISTEN line that does not reference an allowed process is a failure. + if echo "$listeners" | grep 'LISTEN' | grep -Ev 'docker-proxy|docker-pr|nginx' | grep -q .; then + record_failure "Port $port is occupied by an unexpected process (not docker-proxy/nginx)." + echo " Listeners (ss -tlnp sport = :${port}):" + echo "$listeners" | sed 's/^/ /' echo " This is a VPS configuration error. Stop the conflicting service before deploying." echo " Example: sudo systemctl stop nginx OR sudo systemctl stop apache2" return 1 @@ -198,9 +192,16 @@ for dir in "$RUNTIME_DIR" "$LOG_DIR"; do if [ ! -d "$dir" ]; then warn "Runtime directory missing: $dir — creating it." install -d -m 750 "$dir" 2>/dev/null || sudo install -d -m 750 "$dir" - ok "Created: $dir" + fi + if [ ! -w "$dir" ]; then + warn "Runtime directory not writable by deploy user: $dir — fixing ownership." + sudo chown "$(id -un):$(id -gn)" "$dir" 2>/dev/null || true + sudo chmod u+rwx "$dir" 2>/dev/null || true + fi + if [ ! -w "$dir" ]; then + record_failure "Cannot write to $dir — run: sudo chown -R $(id -un):$(id -gn) $dir" else - ok "Directory exists: $dir" + ok "Directory ready (writable): $dir" fi done