From b12a5e11a5b87727bf7ee3c547ad849ed66600da Mon Sep 17 00:00:00 2001 From: RoyLin Date: Mon, 1 Jun 2026 10:03:06 +0800 Subject: [PATCH] fix(passive-health): break the deadlock that pinned a backend at 503 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Once a backend exceeded the error threshold it was marked unhealthy and dropped from rotation, but recovery only happened inside record_success. An unhealthy backend receives no traffic, so no success ever arrived and the service stayed 503 until the gateway was restarted — a single transient burst of SendRequest/5xx errors could take a whole service down indefinitely. A background recovery ticker now drives a half-open probe: after recovery_time elapses the backend is re-enabled so it receives traffic again; if still broken the next errors re-mark it, otherwise it stays healthy. BackendErrors keeps an Arc so the ticker can flip the flag without live traffic. The ticker holds a Weak ref and exits when its checker is dropped (config reload), avoiding task accumulation. Release 1.0.6. --- CHANGELOG.md | 6 +++ Cargo.lock | 2 +- Cargo.toml | 2 +- src/gateway/builders.rs | 9 ++-- src/service/passive_health.rs | 93 ++++++++++++++++++++++++++++++++++- 5 files changed, 104 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ebc77a0..85eb110 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.0.6] - 2026-06-01 + +### Fixed + +- Passive health check no longer deadlocks a backend into permanent unavailability. 
Previously, once a backend exceeded the error threshold it was marked unhealthy and dropped from rotation; recovery only happened inside `record_success`, but an unhealthy backend receives no traffic, so no success ever arrived and the service returned `503` until the gateway was restarted (a single transient burst of `SendRequest`/5xx errors could take a whole service down indefinitely). A background recovery ticker now drives a half-open probe: after `recovery_time` elapses the backend is re-enabled so it receives traffic again — if it is still broken the next errors re-mark it, otherwise it stays healthy. The ticker holds a `Weak` reference and exits when its checker is dropped (config reload), avoiding task accumulation. + ## [1.0.5] - 2026-05-31 ### Fixed diff --git a/Cargo.lock b/Cargo.lock index 6b8daac..c6f900c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8,7 +8,7 @@ version = "0.2.1" [[package]] name = "a3s-gateway" -version = "1.0.5" +version = "1.0.6" dependencies = [ "a3s-acl", "a3s-updater", diff --git a/Cargo.toml b/Cargo.toml index 1f12c27..ad0ebcf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "a3s-gateway" -version = "1.0.5" +version = "1.0.6" edition = "2021" rust-version = "1.88" authors = ["A3S Lab"] diff --git a/src/gateway/builders.rs b/src/gateway/builders.rs index c4af93f..d2a7faa 100644 --- a/src/gateway/builders.rs +++ b/src/gateway/builders.rs @@ -269,10 +269,11 @@ pub fn build_passive_health(config: &GatewayConfig) -> HashMap, /// Timestamps of recent errors within the window errors: Vec, /// When the backend was marked unhealthy (if applicable) @@ -44,8 +49,9 @@ struct BackendErrors { } impl BackendErrors { - fn new() -> Self { + fn new(backend: Arc) -> Self { Self { + backend, errors: Vec::new(), marked_unhealthy_at: None, total_errors: AtomicU64::new(0), @@ -104,7 +110,7 @@ impl PassiveHealthCheck { let mut errors = self.backend_errors.write().unwrap(); let entry = errors .entry(backend.url.clone()) - 
.or_insert_with(BackendErrors::new); + .or_insert_with(|| BackendErrors::new(Arc::clone(backend))); entry.total_errors.fetch_add(1, Ordering::Relaxed); @@ -133,6 +139,59 @@ impl PassiveHealthCheck { } } + /// Re-enable backends whose recovery window has elapsed (half-open probe). + /// Called periodically by the recovery ticker so a backend marked unhealthy + /// gets traffic again after `recovery_time`; if it is still broken the next + /// errors re-mark it, otherwise `record_success` keeps it healthy. This is + /// what breaks the passive-health deadlock. Re-enabling also clears the backend's recorded error timestamps and its unhealthy mark. + pub fn recover_expired(&self) { + let now = Instant::now(); + let mut errors = self.backend_errors.write().unwrap(); + for entry in errors.values_mut() { + if let Some(marked_at) = entry.marked_unhealthy_at { + if now.duration_since(marked_at) >= self.config.recovery_time { + entry.backend.set_healthy(true); + entry.marked_unhealthy_at = None; + entry.errors.clear(); + tracing::info!( + backend = entry.backend.url, + "Backend re-enabled for probe (passive health half-open recovery)" + ); + } + } + } + } + + /// Spawn a background ticker that drives [`Self::recover_expired`]. Without it an + /// unhealthy backend receives no traffic, so no success ever arrives to clear + /// it and the service stays 503 until the gateway is restarted. Uses a `Weak` + /// ref so the task exits when this checker is dropped (e.g. on config reload), + /// avoiding accumulating tasks across reloads. Does nothing when called outside a Tokio runtime; the tick period is `recovery_time` clamped to between 1 and 5 seconds.
+ pub fn spawn_recovery(self: &Arc) { + // Only start when a Tokio runtime is present (true during production startup); without a runtime (e.g. unit-test builders) + // skip spawning; recover_expired can still be called/tested explicitly. + if tokio::runtime::Handle::try_current().is_err() { + return; + } + let weak = Arc::downgrade(self); + let tick = self + .config + .recovery_time + .min(Duration::from_secs(5)) + .max(Duration::from_secs(1)); + tokio::spawn(async move { + let mut ticker = tokio::time::interval(tick); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + loop { + ticker.tick().await; + match weak.upgrade() { + Some(this) => this.recover_expired(), + None => break, + } + } + }); + } + /// Check if a status code is considered an error pub fn is_error_status(&self, status_code: u16) -> bool { self.config.error_status_codes.contains(&status_code) @@ -417,4 +476,34 @@ mod tests { phc.record_error(&backend, 502); assert_eq!(phc.recent_errors("http://127.0.0.1:8001"), 2); } + + #[test] + fn test_recover_expired_reenables_after_recovery_time() { + // recovery_time=0 makes recovery eligible immediately, so the half-open re-enable is tested deterministically (no sleep). + let cfg = PassiveHealthConfig { + error_threshold: 2, + window: Duration::from_secs(60), + error_status_codes: vec![503], + recovery_time: Duration::from_secs(0), + }; + let phc = PassiveHealthCheck::new(cfg); + let backend = make_backend("http://127.0.0.1:8010"); + + // Trip the error threshold so the backend is marked unhealthy + phc.record_error(&backend, 503); + phc.record_error(&backend, 503); + assert!(!backend.is_healthy(), "达到阈值应被标记不健康"); + + // Half-open recovery: once recovery_time has elapsed the backend is re-enabled proactively (breaks the deadlock; no successful request needed) + phc.recover_expired(); + assert!( + backend.is_healthy(), + "recovery_time 过后 recover_expired 应放行后端" + ); + assert_eq!( + phc.recent_errors("http://127.0.0.1:8010"), + 0, + "放行后错误窗口应清空" + ); + } }