Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.0.6] - 2026-06-01

### Fixed

- Passive health check no longer deadlocks a backend into permanent unavailability. Previously, once a backend exceeded the error threshold it was marked unhealthy and dropped from rotation; recovery only happened inside `record_success`, but an unhealthy backend receives no traffic, so no success ever arrived and the service returned `503` until the gateway was restarted (a single transient burst of `SendRequest`/5xx errors could take a whole service down indefinitely). A background recovery ticker now drives a half-open probe: after `recovery_time` elapses the backend is re-enabled so it receives traffic again — if it is still broken the next errors re-mark it, otherwise it stays healthy. The ticker holds a `Weak` reference and exits when its checker is dropped (config reload), avoiding task accumulation.

## [1.0.5] - 2026-05-31

### Fixed
Expand Down
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "a3s-gateway"
version = "1.0.5"
version = "1.0.6"
edition = "2021"
rust-version = "1.88"
authors = ["A3S Lab"]
Expand Down
9 changes: 5 additions & 4 deletions src/gateway/builders.rs
Original file line number Diff line number Diff line change
Expand Up @@ -269,10 +269,11 @@ pub fn build_passive_health(config: &GatewayConfig) -> HashMap<String, Arc<Passi
.services
.keys()
.map(|name| {
(
name.clone(),
Arc::new(PassiveHealthCheck::new(PassiveHealthConfig::default())),
)
let phc = Arc::new(PassiveHealthCheck::new(PassiveHealthConfig::default()));
// 启动半开自愈后台任务:后端被拉黑超过 recovery_time 后主动放行试探,
// 破"拉黑→无流量→无成功→永不恢复"的死锁(否则一次瞬时抖动 = 持续 503 直到重启)。
phc.spawn_recovery();
(name.clone(), phc)
})
.collect()
}
Expand Down
93 changes: 91 additions & 2 deletions src/service/passive_health.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ impl Default for PassiveHealthConfig {

/// Error record for a single backend
struct BackendErrors {
/// Backend handle — kept so the half-open recovery ticker can re-enable it
/// without needing live traffic (passive recovery via `record_success` alone
/// deadlocks: an unhealthy backend receives no requests, so no success ever
/// arrives to clear it).
backend: Arc<Backend>,
/// Timestamps of recent errors within the window
errors: Vec<Instant>,
/// When the backend was marked unhealthy (if applicable)
Expand All @@ -44,8 +49,9 @@ struct BackendErrors {
}

impl BackendErrors {
fn new() -> Self {
fn new(backend: Arc<Backend>) -> Self {
Self {
backend,
errors: Vec::new(),
marked_unhealthy_at: None,
total_errors: AtomicU64::new(0),
Expand Down Expand Up @@ -104,7 +110,7 @@ impl PassiveHealthCheck {
let mut errors = self.backend_errors.write().unwrap();
let entry = errors
.entry(backend.url.clone())
.or_insert_with(BackendErrors::new);
.or_insert_with(|| BackendErrors::new(Arc::clone(backend)));

entry.total_errors.fetch_add(1, Ordering::Relaxed);

Expand Down Expand Up @@ -133,6 +139,59 @@ impl PassiveHealthCheck {
}
}

/// Re-enable backends whose recovery window has elapsed (half-open probe).
/// Called periodically by the recovery ticker so a backend marked unhealthy
/// gets traffic again after `recovery_time`; if it is still broken the next
/// errors re-mark it, otherwise `record_success` keeps it healthy. This is
/// what breaks the passive-health deadlock.
pub fn recover_expired(&self) {
let now = Instant::now();
let mut errors = self.backend_errors.write().unwrap();
for entry in errors.values_mut() {
if let Some(marked_at) = entry.marked_unhealthy_at {
if now.duration_since(marked_at) >= self.config.recovery_time {
entry.backend.set_healthy(true);
entry.marked_unhealthy_at = None;
entry.errors.clear();
tracing::info!(
backend = entry.backend.url,
"Backend re-enabled for probe (passive health half-open recovery)"
);
}
}
}
}

/// Spawn a background ticker that drives [`recover_expired`]. Without it an
/// unhealthy backend receives no traffic, so no success ever arrives to clear
/// it and the service stays 503 until the gateway is restarted. Uses a `Weak`
/// ref so the task exits when this checker is dropped (e.g. on config reload),
/// avoiding accumulating tasks across reloads.
pub fn spawn_recovery(self: &Arc<Self>) {
// 仅在有 Tokio runtime 时启动(生产启动期满足);无 runtime(如单元测试构建器)
// 直接跳过,recover_expired 仍可被显式调用/测试。
if tokio::runtime::Handle::try_current().is_err() {
return;
}
let weak = Arc::downgrade(self);
let tick = self
.config
.recovery_time
.min(Duration::from_secs(5))
.max(Duration::from_secs(1));
tokio::spawn(async move {
let mut ticker = tokio::time::interval(tick);
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
loop {
ticker.tick().await;
match weak.upgrade() {
Some(this) => this.recover_expired(),
None => break,
}
}
});
}

/// Check if a status code is considered an error
pub fn is_error_status(&self, status_code: u16) -> bool {
self.config.error_status_codes.contains(&status_code)
Expand Down Expand Up @@ -417,4 +476,34 @@ mod tests {
phc.record_error(&backend, 502);
assert_eq!(phc.recent_errors("http://127.0.0.1:8001"), 2);
}

#[test]
fn test_recover_expired_reenables_after_recovery_time() {
// recovery_time=0 让恢复立即可触发,确定性测试半开放行(不依赖 sleep)。
let cfg = PassiveHealthConfig {
error_threshold: 2,
window: Duration::from_secs(60),
error_status_codes: vec![503],
recovery_time: Duration::from_secs(0),
};
let phc = PassiveHealthCheck::new(cfg);
let backend = make_backend("http://127.0.0.1:8010");

// 触发拉黑
phc.record_error(&backend, 503);
phc.record_error(&backend, 503);
assert!(!backend.is_healthy(), "达到阈值应被标记不健康");

// 半开恢复:recovery_time 过后主动放行(破死锁,无需成功请求)
phc.recover_expired();
assert!(
backend.is_healthy(),
"recovery_time 过后 recover_expired 应放行后端"
);
assert_eq!(
phc.recent_errors("http://127.0.0.1:8010"),
0,
"放行后错误窗口应清空"
);
}
}
Loading