From b12a5e11a5b87727bf7ee3c547ad849ed66600da Mon Sep 17 00:00:00 2001
From: RoyLin <roylin@RoyLindeMacBook-Pro.local>
Date: Mon, 1 Jun 2026 10:03:06 +0800
Subject: [PATCH] fix(passive-health): break the deadlock that pinned a backend
 at 503
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Once a backend exceeded the error threshold it was marked unhealthy and
dropped from rotation, but recovery only happened inside record_success.
An unhealthy backend receives no traffic, so no success ever arrived and
the service stayed 503 until the gateway was restarted — a single
transient burst of SendRequest/5xx errors could take a whole service
down indefinitely.

A background recovery ticker now drives a half-open probe: after
recovery_time elapses the backend is re-enabled so it receives traffic
again; if still broken the next errors re-mark it, otherwise it stays
healthy. BackendErrors keeps an Arc<Backend> so the ticker can flip the
flag without live traffic. The ticker holds a Weak ref and exits when
its checker is dropped (config reload), avoiding task accumulation.

Release 1.0.6.
---
 CHANGELOG.md                  |  6 +++
 Cargo.lock                    |  2 +-
 Cargo.toml                    |  2 +-
 src/gateway/builders.rs       |  9 ++--
 src/service/passive_health.rs | 93 ++++++++++++++++++++++++++++++++++-
 5 files changed, 104 insertions(+), 8 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ebc77a0..85eb110 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.6] - 2026-06-01
+
+### Fixed
+
+- Passive health check no longer deadlocks a backend into permanent unavailability. Previously, once a backend exceeded the error threshold it was marked unhealthy and dropped from rotation; recovery only happened inside `record_success`, but an unhealthy backend receives no traffic, so no success ever arrived and the service returned `503` until the gateway was restarted (a single transient burst of `SendRequest`/5xx errors could take a whole service down indefinitely). A background recovery ticker now drives a half-open probe: after `recovery_time` elapses the backend is re-enabled so it receives traffic again — if it is still broken the next errors re-mark it, otherwise it stays healthy. The ticker holds a `Weak` reference and exits when its checker is dropped (config reload), avoiding task accumulation.
+
 ## [1.0.5] - 2026-05-31
 
 ### Fixed
diff --git a/Cargo.lock b/Cargo.lock
index 6b8daac..c6f900c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8,7 +8,7 @@ version = "0.2.1"
 
 [[package]]
 name = "a3s-gateway"
-version = "1.0.5"
+version = "1.0.6"
 dependencies = [
  "a3s-acl",
  "a3s-updater",
diff --git a/Cargo.toml b/Cargo.toml
index 1f12c27..ad0ebcf 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "a3s-gateway"
-version = "1.0.5"
+version = "1.0.6"
 edition = "2021"
 rust-version = "1.88"
 authors = ["A3S Lab"]
diff --git a/src/gateway/builders.rs b/src/gateway/builders.rs
index c4af93f..d2a7faa 100644
--- a/src/gateway/builders.rs
+++ b/src/gateway/builders.rs
@@ -269,10 +269,11 @@ pub fn build_passive_health(config: &GatewayConfig) -> HashMap<String, Arc<Passi
         .services
         .keys()
         .map(|name| {
-            (
-                name.clone(),
-                Arc::new(PassiveHealthCheck::new(PassiveHealthConfig::default())),
-            )
+            let phc = Arc::new(PassiveHealthCheck::new(PassiveHealthConfig::default()));
+            // 启动半开自愈后台任务:后端被拉黑超过 recovery_time 后主动放行试探,
+            // 破"拉黑→无流量→无成功→永不恢复"的死锁(否则一次瞬时抖动 = 持续 503 直到重启)。
+            phc.spawn_recovery();
+            (name.clone(), phc)
         })
         .collect()
 }
diff --git a/src/service/passive_health.rs b/src/service/passive_health.rs
index 3474c63..5c8d915 100644
--- a/src/service/passive_health.rs
+++ b/src/service/passive_health.rs
@@ -35,6 +35,11 @@ impl Default for PassiveHealthConfig {
 
 /// Error record for a single backend
 struct BackendErrors {
+    /// Backend handle — kept so the half-open recovery ticker can re-enable it
+    /// without needing live traffic (passive recovery via `record_success` alone
+    /// deadlocks: an unhealthy backend receives no requests, so no success ever
+    /// arrives to clear it).
+    backend: Arc<Backend>,
     /// Timestamps of recent errors within the window
     errors: Vec<Instant>,
     /// When the backend was marked unhealthy (if applicable)
@@ -44,8 +49,9 @@ struct BackendErrors {
 }
 
 impl BackendErrors {
-    fn new() -> Self {
+    fn new(backend: Arc<Backend>) -> Self {
         Self {
+            backend,
             errors: Vec::new(),
             marked_unhealthy_at: None,
             total_errors: AtomicU64::new(0),
@@ -104,7 +110,7 @@ impl PassiveHealthCheck {
         let mut errors = self.backend_errors.write().unwrap();
         let entry = errors
             .entry(backend.url.clone())
-            .or_insert_with(BackendErrors::new);
+            .or_insert_with(|| BackendErrors::new(Arc::clone(backend)));
 
         entry.total_errors.fetch_add(1, Ordering::Relaxed);
 
@@ -133,6 +139,59 @@ impl PassiveHealthCheck {
         }
     }
 
+    /// Re-enable backends whose recovery window has elapsed (half-open probe).
+    /// Called periodically by the recovery ticker so a backend marked unhealthy
+    /// gets traffic again after `recovery_time`; if it is still broken the next
+    /// errors re-mark it, otherwise `record_success` keeps it healthy. This is
+    /// what breaks the passive-health deadlock.
+    pub fn recover_expired(&self) {
+        let now = Instant::now();
+        let mut errors = self.backend_errors.write().unwrap();
+        for entry in errors.values_mut() {
+            if let Some(marked_at) = entry.marked_unhealthy_at {
+                if now.duration_since(marked_at) >= self.config.recovery_time {
+                    entry.backend.set_healthy(true);
+                    entry.marked_unhealthy_at = None;
+                    entry.errors.clear();
+                    tracing::info!(
+                        backend = entry.backend.url,
+                        "Backend re-enabled for probe (passive health half-open recovery)"
+                    );
+                }
+            }
+        }
+    }
+
+    /// Spawn a background ticker that drives [`recover_expired`]. Without it an
+    /// unhealthy backend receives no traffic, so no success ever arrives to clear
+    /// it and the service stays 503 until the gateway is restarted. Uses a `Weak`
+    /// ref so the task exits when this checker is dropped (e.g. on config reload),
+    /// avoiding accumulating tasks across reloads.
+    pub fn spawn_recovery(self: &Arc<Self>) {
+        // 仅在有 Tokio runtime 时启动(生产启动期满足);无 runtime(如单元测试构建器)
+        // 直接跳过,recover_expired 仍可被显式调用/测试。
+        if tokio::runtime::Handle::try_current().is_err() {
+            return;
+        }
+        let weak = Arc::downgrade(self);
+        let tick = self
+            .config
+            .recovery_time
+            .min(Duration::from_secs(5))
+            .max(Duration::from_secs(1));
+        tokio::spawn(async move {
+            let mut ticker = tokio::time::interval(tick);
+            ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
+            loop {
+                ticker.tick().await;
+                match weak.upgrade() {
+                    Some(this) => this.recover_expired(),
+                    None => break,
+                }
+            }
+        });
+    }
+
     /// Check if a status code is considered an error
     pub fn is_error_status(&self, status_code: u16) -> bool {
         self.config.error_status_codes.contains(&status_code)
@@ -417,4 +476,34 @@ mod tests {
         phc.record_error(&backend, 502);
         assert_eq!(phc.recent_errors("http://127.0.0.1:8001"), 2);
     }
+
+    #[test]
+    fn test_recover_expired_reenables_after_recovery_time() {
+        // recovery_time=0 让恢复立即可触发,确定性测试半开放行(不依赖 sleep)。
+        let cfg = PassiveHealthConfig {
+            error_threshold: 2,
+            window: Duration::from_secs(60),
+            error_status_codes: vec![503],
+            recovery_time: Duration::from_secs(0),
+        };
+        let phc = PassiveHealthCheck::new(cfg);
+        let backend = make_backend("http://127.0.0.1:8010");
+
+        // 触发拉黑
+        phc.record_error(&backend, 503);
+        phc.record_error(&backend, 503);
+        assert!(!backend.is_healthy(), "达到阈值应被标记不健康");
+
+        // 半开恢复:recovery_time 过后主动放行(破死锁,无需成功请求)
+        phc.recover_expired();
+        assert!(
+            backend.is_healthy(),
+            "recovery_time 过后 recover_expired 应放行后端"
+        );
+        assert_eq!(
+            phc.recent_errors("http://127.0.0.1:8010"),
+            0,
+            "放行后错误窗口应清空"
+        );
+    }
 }