Skip to content

Commit c69a1d1

Browse files
committed
fix(vps): update path for optional .env.monitoring file check
1 parent 61d359f commit c69a1d1

2 files changed

Lines changed: 84 additions & 2 deletions

File tree

scripts/deploy-bluegreen.sh

Lines changed: 83 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,39 @@ _ft_final_state() {
170170
_ft_log "msg='final_state' deploy_id=$DEPLOY_ID active=$active_container sha=${image_sha:0:12} nginx_upstream=$nginx_upstream"
171171
}
172172

173+
# ---------------------------------------------------------------------------
174+
# DOCKER HEALTH GATE
175+
# Waits for the container's HEALTHCHECK to reach "healthy" before allowing
176+
# nginx to switch. If the container has no HEALTHCHECK defined, this returns
177+
# immediately (status="none") to avoid blocking on unconfigured containers.
178+
# ---------------------------------------------------------------------------
179+
# Arguments:
#   $1 - name (or ID) of the container to poll
# Returns:
#   0 - health status is "healthy", or the container defines no HEALTHCHECK
#   1 - health status is "unhealthy", or no terminal status was observed
#       within 30 polls taken 2 seconds apart (failed inspects included)
_ft_wait_docker_health() {
  local name="$1"
  local -r max_attempts=30
  local i=1
  local STATUS=""
  while [ "$i" -le "$max_attempts" ]; do
    # The template itself maps a missing .State.Health to "none". Deriving
    # "none" from a failed `docker inspect` would also classify a missing
    # container or an unreachable daemon as "no HEALTHCHECK" and open the gate.
    if ! STATUS=$(docker inspect \
      --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' \
      "$name" 2>/dev/null); then
      # Inspect itself failed: not a terminal state. Keep polling; if it never
      # recovers, the timeout below fails the gate.
      STATUS="inspect_failed"
    fi
    case "$STATUS" in
      healthy)
        _ft_log "msg='docker health check passed' container=$name"
        return 0
        ;;
      unhealthy)
        _ft_error "msg='docker health check failed' container=$name status=unhealthy"
        return 1
        ;;
      none)
        # No HEALTHCHECK configured -- skip the gate instead of blocking.
        _ft_log "msg='docker health gate skipped (no HEALTHCHECK defined)' container=$name"
        return 0
        ;;
    esac
    # Any other status (e.g. "starting"): report progress every 5th attempt.
    if [ $(( i % 5 )) -eq 0 ]; then
      _ft_log "msg='waiting for docker health' attempt=$i/$max_attempts status=$STATUS container=$name"
    fi
    sleep 2
    i=$(( i + 1 ))
  done
  _ft_error "msg='docker health timeout' container=$name last_status=$STATUS"
  return 1
}
205+
173206
# ---------------------------------------------------------------------------
174207
# SYSTEM SNAPSHOT -- emitted on any unrecoverable failure
175208
# ---------------------------------------------------------------------------
@@ -991,7 +1024,35 @@ _ft_log "msg='phase_complete' phase=HEALTH_CHECK_INTERNAL status=success contain
9911024
_ft_phase_end "HEALTH_CHECK_INTERNAL"
9921025

9931026
# ---------------------------------------------------------------------------
994-
# [5/7] SWITCH NGINX UPSTREAM
1027+
# DOCKER HEALTH GATE
1028+
# Ensures the container's HEALTHCHECK has settled to "healthy" before
1029+
# switching nginx. Prevents routing to a container that is "starting".
1030+
# ---------------------------------------------------------------------------
1031+
# Run the Docker health gate for the new container. When the gate reports
# failure: print the last 50 container log lines on stderr, stop and remove
# the container (each step best effort), then call _ft_exit with status 1.
_ft_wait_docker_health "$INACTIVE_NAME" || {
  docker logs "$INACTIVE_NAME" --tail 50 >&2 || true
  docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true
  docker rm "$INACTIVE_NAME" || true
  _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=docker_health_failed container=$INACTIVE_NAME"
}

# STABILIZATION DELAY -- short fixed pause once the health gate has passed,
# giving in-flight connection setup (TLS session init, worker warm-up) time
# to settle before the next probe.
_ft_log "msg='stabilization delay' container=$INACTIVE_NAME"
sleep 3
1042+
1043+
# PRE-SWITCH CONNECTIVITY CHECK
1044+
# Direct in-network probe of the new container BEFORE touching nginx.
1045+
# Validates Docker DNS resolution + bridge routing work for the new container
1046+
# one final time with a clean, fresh curl invocation.
1047+
# One-shot probe of the new container's /ready endpoint from a throwaway
# curl container attached to $NETWORK (5s cap, output discarded). A failed
# probe logs the error, dumps the last 50 container log lines to stderr,
# removes the container (best effort) and calls _ft_exit with status 1.
docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \
  -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready" >/dev/null 2>&1 || {
  _ft_error "msg='pre-switch connectivity check failed' container=$INACTIVE_NAME"
  docker logs "$INACTIVE_NAME" --tail 50 >&2 || true
  docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true
  docker rm "$INACTIVE_NAME" || true
  _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_switch_connectivity_failed container=$INACTIVE_NAME"
}
_ft_log "msg='pre-switch connectivity check passed' container=$INACTIVE_NAME"
9951056
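# ---------------------------------------------------------------------------
# [5/7] SWITCH NGINX UPSTREAM
# ---------------------------------------------------------------------------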
9961057
_ft_state "SWITCH_NGINX" "msg='switching nginx upstream' container=$INACTIVE_NAME"
9971058

@@ -1113,6 +1174,27 @@ fi
11131174
unset _POST_SWITCH_OK _ps
11141175
_ft_log "msg='post-switch routing verification passed'"
11151176

1177+
# POST-SWITCH UPSTREAM VERIFICATION
1178+
# Directly probe the new container via its in-network address after nginx
1179+
# has confirmed routing. Ensures the upstream backend itself is still
1180+
# responding — nginx routing healthy does NOT imply backend healthy.
1181+
# Probe the new container's /ready endpoint directly from a throwaway curl
# container on $NETWORK (5s cap, output discarded). On failure, roll back.
if ! docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \
  -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready" >/dev/null 2>&1; then
  _ft_error "msg='post-switch upstream verification failed' container=$INACTIVE_NAME"
  _ft_snapshot
  # Restore $NGINX_CONF from $NGINX_BACKUP, validate it, then reload nginx.
  # The copy is part of the guarded chain: if it fails, the config test and
  # reload would run against the un-restored file and could still succeed,
  # which must not be reported as "nginx restored".
  if cp "$NGINX_BACKUP" "$NGINX_CONF" \
    && docker exec nginx nginx -t >/dev/null 2>&1 \
    && docker exec nginx nginx -s reload >/dev/null 2>&1; then
    _ft_log "msg='nginx restored (post-switch upstream failure)'"
  else
    _ft_log "level=ERROR msg='nginx restore failed during upstream verification rollback'"
  fi
  # Write $ACTIVE back through _ft_write_slot (presumably re-marking the
  # previous slot as live -- confirm against the helper), then stop and
  # remove the new container; both docker steps are best effort.
  _ft_write_slot "$ACTIVE"
  docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true
  docker rm "$INACTIVE_NAME" || true
  # NOTE(review): this exit state is also used when the nginx restore above
  # failed -- confirm that DEPLOY_FAILED_SAFE is the intended label then.
  _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_upstream_failed container=$INACTIVE_NAME"
fi
_ft_log "msg='post-switch upstream verification passed' container=$INACTIVE_NAME"
1197+
11161198
# ---------------------------------------------------------------------------
11171199
# [6/7] PUBLIC HEALTH CHECK (end-to-end nginx routing)
11181200
# Validates:

scripts/vps-readiness-check.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ for f in "${REQUIRED_ENV_FILES[@]}"; do
192192
done
193193

194194
# .env.monitoring is optional (monitoring-sync.sh self-heals from example)
195-
if [ ! -f "$DEPLOY_ROOT/.env.monitoring" ]; then
195+
# Only warn when infra/.env.monitoring is absent; the check never fails the
# readiness run on its own.
[ -f "$DEPLOY_ROOT/infra/.env.monitoring" ] \
  || warn ".env.monitoring not found — monitoring-sync.sh will create it from example during deploy."
198198

0 commit comments

Comments
 (0)