Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 83 additions & 1 deletion scripts/deploy-bluegreen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,39 @@ _ft_final_state() {
_ft_log "msg='final_state' deploy_id=$DEPLOY_ID active=$active_container sha=${image_sha:0:12} nginx_upstream=$nginx_upstream"
}

# ---------------------------------------------------------------------------
# DOCKER HEALTH GATE
# Waits for the container's HEALTHCHECK to reach "healthy" before allowing
# nginx to switch. If the container has no HEALTHCHECK defined, this returns
# immediately (status="none") to avoid blocking on unconfigured containers.
#
# Arguments:
#   $1 - name (or ID) of the container to inspect
#   $2 - optional: maximum number of polls (default 30)
#   $3 - optional: seconds to sleep between polls (default 2)
# Returns:
#   0 - status is "healthy", or the container defines no HEALTHCHECK
#   1 - status is "unhealthy", the container could not be inspected, or the
#       status did not settle within the polling budget
# ---------------------------------------------------------------------------
_ft_wait_docker_health() {
    local name="$1"
    local max_attempts="${2:-30}"
    local interval="${3:-2}"
    local attempt=1
    local status=""

    while [ "$attempt" -le "$max_attempts" ]; do
        # The {{if}} guard makes the template itself print "none" when the
        # container has no HEALTHCHECK, so a non-zero exit from `docker inspect`
        # can only mean the inspection failed (missing container, daemon
        # unreachable). That must fail the gate instead of being mistaken for
        # "no HEALTHCHECK defined" and silently skipping it.
        if ! status=$(docker inspect \
            --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' \
            "$name" 2>/dev/null); then
            _ft_error "msg='docker health inspect failed' container=$name"
            return 1
        fi

        case "$status" in
            healthy)
                _ft_log "msg='docker health check passed' container=$name"
                return 0
                ;;
            unhealthy)
                _ft_error "msg='docker health check failed' container=$name status=unhealthy"
                return 1
                ;;
            none)
                # "none" means the image has no HEALTHCHECK -- skip the gate.
                _ft_log "msg='docker health gate skipped (no HEALTHCHECK defined)' container=$name"
                return 0
                ;;
        esac

        # Any other status (e.g. "starting"): keep polling, logging every 5th
        # attempt to avoid flooding the deploy log.
        if [ "$(( attempt % 5 ))" -eq 0 ]; then
            _ft_log "msg='waiting for docker health' attempt=$attempt/$max_attempts status=$status container=$name"
        fi
        sleep "$interval"
        attempt=$(( attempt + 1 ))
    done

    _ft_error "msg='docker health timeout' container=$name last_status=$status"
    return 1
}

# ---------------------------------------------------------------------------
# SYSTEM SNAPSHOT -- emitted on any unrecoverable failure
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -991,7 +1024,35 @@ _ft_log "msg='phase_complete' phase=HEALTH_CHECK_INTERNAL status=success contain
_ft_phase_end "HEALTH_CHECK_INTERNAL"

# ---------------------------------------------------------------------------
# [5/7] SWITCH NGINX UPSTREAM
# DOCKER HEALTH GATE
# Hold the nginx switch until the candidate container's HEALTHCHECK has
# settled to "healthy", so traffic is not routed to one still "starting".
# ---------------------------------------------------------------------------
_ft_wait_docker_health "$INACTIVE_NAME" || {
    # Gate failed: dump the last 50 log lines to stderr, then stop and remove
    # the candidate container (each step best-effort) before handing off to
    # _ft_exit.
    docker logs "$INACTIVE_NAME" --tail 50 >&2 || true
    docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true
    docker rm "$INACTIVE_NAME" || true
    _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=docker_health_failed container=$INACTIVE_NAME"
}

# STABILIZATION DELAY -- short fixed pause after the docker health gate so
# in-flight connection setup can settle (TLS session init, worker warm-up).
_ft_log "msg='stabilization delay' container=$INACTIVE_NAME"
sleep 3

# PRE-SWITCH CONNECTIVITY CHECK
# One last in-network probe of the new container BEFORE nginx is touched.
# A throwaway container on $NETWORK requests /ready by container name, which
# exercises Docker DNS resolution and bridge routing with a fresh invocation.
docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \
    -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready" >/dev/null 2>&1 || {
    # Probe failed: report it, dump the last 50 log lines to stderr, then stop
    # and remove the candidate (best-effort) before handing off to _ft_exit.
    _ft_error "msg='pre-switch connectivity check failed' container=$INACTIVE_NAME"
    docker logs "$INACTIVE_NAME" --tail 50 >&2 || true
    docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true
    docker rm "$INACTIVE_NAME" || true
    _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_switch_connectivity_failed container=$INACTIVE_NAME"
}
_ft_log "msg='pre-switch connectivity check passed' container=$INACTIVE_NAME"
# ---------------------------------------------------------------------------
_ft_state "SWITCH_NGINX" "msg='switching nginx upstream' container=$INACTIVE_NAME"

Expand Down Expand Up @@ -1113,6 +1174,27 @@ fi
unset _POST_SWITCH_OK _ps
_ft_log "msg='post-switch routing verification passed'"

# POST-SWITCH UPSTREAM VERIFICATION
# Directly probe the new container via its in-network address after nginx
# has confirmed routing. Ensures the upstream backend itself is still
# responding -- nginx routing healthy does NOT imply backend healthy.
#
# The probe runs in a throwaway container (--rm) attached to $NETWORK and
# requests /ready on $INACTIVE_NAME:$APP_PORT with a 5 second cap; its output
# is discarded and only the exit status is used.
# NOTE(review): $_FT_CURL_IMG is presumably a curl image whose entrypoint
# accepts these flags -- confirm where it is defined.
if ! docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \
-sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready" >/dev/null 2>&1; then
_ft_error "msg='post-switch upstream verification failed' container=$INACTIVE_NAME"
_ft_snapshot
# Roll back: copy the saved nginx config over the live one, then validate it
# (nginx -t) and reload inside the container named "nginx". The reload is
# only attempted when validation succeeds.
cp "$NGINX_BACKUP" "$NGINX_CONF"
if docker exec nginx nginx -t >/dev/null 2>&1 && docker exec nginx nginx -s reload >/dev/null 2>&1; then
_ft_log "msg='nginx restored (post-switch upstream failure)'"
else
# NOTE(review): a failed restore is only logged; the run still ends below
# with DEPLOY_FAILED_SAFE -- confirm that status is intended in this case.
_ft_log "level=ERROR msg='nginx restore failed during upstream verification rollback'"
fi
# NOTE(review): presumably re-records $ACTIVE as the live slot -- confirm
# against _ft_write_slot.
_ft_write_slot "$ACTIVE"
# Best-effort removal of the rejected container; failures are ignored.
docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true
docker rm "$INACTIVE_NAME" || true
_ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_upstream_failed container=$INACTIVE_NAME"
fi
_ft_log "msg='post-switch upstream verification passed' container=$INACTIVE_NAME"

# ---------------------------------------------------------------------------
# [6/7] PUBLIC HEALTH CHECK (end-to-end nginx routing)
# Validates:
Expand Down
2 changes: 1 addition & 1 deletion scripts/vps-readiness-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ for f in "${REQUIRED_ENV_FILES[@]}"; do
done

# .env.monitoring is optional (monitoring-sync.sh self-heals from example)
if [ ! -f "$DEPLOY_ROOT/.env.monitoring" ]; then
if [ ! -f "$DEPLOY_ROOT/infra/.env.monitoring" ]; then
    # A missing file only warrants a warning here; the message itself states
    # that monitoring-sync.sh creates it from the example during deploy.
    warn ".env.monitoring not found — monitoring-sync.sh will create it from example during deploy."
fi

Expand Down
Loading