@@ -170,6 +170,39 @@ _ft_final_state() {
170170 _ft_log " msg='final_state' deploy_id=$DEPLOY_ID active=$active_container sha=${image_sha: 0: 12} nginx_upstream=$nginx_upstream "
171171}
172172
# ---------------------------------------------------------------------------
# DOCKER HEALTH GATE
# Polls the container's Docker HEALTHCHECK status until it reports "healthy",
# so nginx is only switched once Docker itself considers the container ready.
#
# Arguments:
#   $1 - container name (required)
#   $2 - maximum number of polls  (optional, default 30)
#   $3 - seconds between polls    (optional, default 2)
# Returns:
#   0 - status is "healthy", or the container defines no HEALTHCHECK
#       (gate is skipped so unconfigured containers are not blocked)
#   1 - status is "unhealthy", or "healthy" was not reached within the
#       polling budget (this includes `docker inspect` failing on every poll)
# ---------------------------------------------------------------------------
_ft_wait_docker_health() {
  local name="$1"
  local max_attempts="${2:-30}"
  local interval="${3:-2}"
  local attempt=1
  local status=""

  while [ "$attempt" -le "$max_attempts" ]; do
    # The template itself prints "none" when .State.Health is absent. A failing
    # `docker inspect` (missing container, daemon error) is therefore kept
    # distinct from "no HEALTHCHECK defined" and can never skip the gate.
    if ! status=$(docker inspect \
        --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' \
        "$name" 2>/dev/null); then
      status="inspect_failed"
    fi

    case "$status" in
      healthy)
        _ft_log "msg='docker health check passed' container=$name"
        return 0
        ;;
      unhealthy)
        _ft_error "msg='docker health check failed' container=$name status=unhealthy"
        return 1
        ;;
      none)
        # No HEALTHCHECK configured: nothing to wait for.
        _ft_log "msg='docker health gate skipped (no HEALTHCHECK defined)' container=$name"
        return 0
        ;;
    esac

    # Any other value ("starting", empty, inspect_failed) means keep polling;
    # progress is logged on every fifth attempt only, to limit log noise.
    if [ $((attempt % 5)) -eq 0 ]; then
      _ft_log "msg='waiting for docker health' attempt=$attempt/$max_attempts status=$status container=$name"
    fi
    sleep "$interval"
    attempt=$((attempt + 1))
  done

  _ft_error "msg='docker health timeout' container=$name last_status=$status"
  return 1
}
205+
173206# ---------------------------------------------------------------------------
174207# SYSTEM SNAPSHOT -- emitted on any unrecoverable failure
175208# ---------------------------------------------------------------------------
@@ -991,7 +1024,35 @@ _ft_log "msg='phase_complete' phase=HEALTH_CHECK_INTERNAL status=success contain
9911024_ft_phase_end " HEALTH_CHECK_INTERNAL"
9921025
9931026# ---------------------------------------------------------------------------
994- # [5/7] SWITCH NGINX UPSTREAM
# DOCKER HEALTH GATE
# Holds the nginx switch until the new container's HEALTHCHECK has settled to
# "healthy", so traffic is not routed to a container that is still "starting".
# When the gate fails, the container's last 50 log lines are written to
# stderr, the container is stopped and removed, and _ft_exit is invoked with
# DEPLOY_FAILED_SAFE. Each cleanup step is best-effort (|| true) so a failing
# docker command cannot prevent the later steps from running.
# ---------------------------------------------------------------------------
_ft_wait_docker_health "$INACTIVE_NAME" || {
  docker logs "$INACTIVE_NAME" --tail 50 >&2 || true
  docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true
  docker rm "$INACTIVE_NAME" || true
  _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=docker_health_failed container=$INACTIVE_NAME"
}

# STABILIZATION DELAY -- short fixed pause once the health gate has passed,
# giving in-flight connection setup (TLS session init, worker warm-up) time
# to settle before the next probe.
_ft_log "msg='stabilization delay' container=$INACTIVE_NAME"
sleep 3
1042+
# PRE-SWITCH CONNECTIVITY CHECK
# Last direct probe of the new container before nginx is touched: a throwaway
# curl container attached to $NETWORK requests /ready by container name, which
# exercises Docker DNS resolution and bridge routing with a fresh invocation.
# On failure the container's recent logs go to stderr, the container is
# stopped and removed (best-effort), and _ft_exit is invoked with
# DEPLOY_FAILED_SAFE.
_pre_switch_url="http://$INACTIVE_NAME:$APP_PORT/ready"
_PRE_SWITCH_RC=0
docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \
  -sf --max-time 5 "$_pre_switch_url" >/dev/null 2>&1 || _PRE_SWITCH_RC=$?
if [ "$_PRE_SWITCH_RC" -ne 0 ]; then
  _ft_error "msg='pre-switch connectivity check failed' container=$INACTIVE_NAME"
  docker logs "$INACTIVE_NAME" --tail 50 >&2 || true
  docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true
  docker rm "$INACTIVE_NAME" || true
  _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=pre_switch_connectivity_failed container=$INACTIVE_NAME"
fi
_ft_log "msg='pre-switch connectivity check passed' container=$INACTIVE_NAME"
unset _PRE_SWITCH_RC _pre_switch_url
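
# ---------------------------------------------------------------------------
# [5/7] SWITCH NGINX UPSTREAM
# ---------------------------------------------------------------------------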
9961057_ft_state " SWITCH_NGINX" " msg='switching nginx upstream' container=$INACTIVE_NAME "
9971058
@@ -1113,6 +1174,27 @@ fi
11131174unset _POST_SWITCH_OK _ps
11141175_ft_log " msg='post-switch routing verification passed'"
11151176
# POST-SWITCH UPSTREAM VERIFICATION
# Probes the new container directly over $NETWORK after the nginx routing
# check has passed. Routing through nginx being healthy does NOT imply the
# backend itself is still answering, so /ready is requested once more.
#
# On failure: a system snapshot is emitted, the nginx config is restored from
# $NGINX_BACKUP and reloaded, the slot is written back to $ACTIVE, the new
# container is stopped and removed (best-effort), and _ft_exit is invoked
# with DEPLOY_FAILED_SAFE.
if ! docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" \
    -sf --max-time 5 "http://$INACTIVE_NAME:$APP_PORT/ready" >/dev/null 2>&1; then
  _ft_error "msg='post-switch upstream verification failed' container=$INACTIVE_NAME"
  _ft_snapshot
  # The copy is part of the restore condition: if it fails, `nginx -t` and the
  # reload would only re-validate the unreverted config, and the rollback
  # would be logged as successful even though nothing was restored.
  if cp "$NGINX_BACKUP" "$NGINX_CONF" \
      && docker exec nginx nginx -t >/dev/null 2>&1 \
      && docker exec nginx nginx -s reload >/dev/null 2>&1; then
    _ft_log "msg='nginx restored (post-switch upstream failure)'"
  else
    _ft_log "level=ERROR msg='nginx restore failed during upstream verification rollback'"
  fi
  _ft_write_slot "$ACTIVE"
  docker stop --time 10 "$INACTIVE_NAME" 2>/dev/null || true
  docker rm "$INACTIVE_NAME" || true
  _ft_exit 1 "DEPLOY_FAILED_SAFE" "reason=post_switch_upstream_failed container=$INACTIVE_NAME"
fi
_ft_log "msg='post-switch upstream verification passed' container=$INACTIVE_NAME"
1197+
11161198# ---------------------------------------------------------------------------
11171199# [6/7] PUBLIC HEALTH CHECK (end-to-end nginx routing)
11181200# Validates:
0 commit comments