From a602690cfdbff799f76d01fedbe413a65380d980 Mon Sep 17 00:00:00 2001 From: Nick Reisenauer Date: Tue, 16 Jun 2026 19:30:32 -0700 Subject: [PATCH] Decouple listRunners-outage backstop window from the zombie-reap interval MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #43 review (Copilot): the runner-API outage backstop reused neverConfirmedReapInterval as its "sustained" threshold, coupling two independent behaviors — tuning or disabling the zombie reaper would silently widen or disable the outage backstop and could re-enable the runaway scale-up during a prolonged outage. Give the backstop its own listRunnersOutageHoldInterval (default 120s). Also document neverConfirmedReapInterval's value semantics: a negative value disables the reaper; a very small value (e.g. 0) reaps a runner before it can come online (test-only) — the production default leaves a real connect window. OrchestratorTests 37 passing; build clean. --- Sources/MactionsCore/Orchestrator.swift | 19 +++++++++++++++++-- .../MactionsCoreTests/OrchestratorTests.swift | 8 +++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/Sources/MactionsCore/Orchestrator.swift b/Sources/MactionsCore/Orchestrator.swift index 45d9c54..033f679 100644 --- a/Sources/MactionsCore/Orchestrator.swift +++ b/Sources/MactionsCore/Orchestrator.swift @@ -152,7 +152,20 @@ public final class RunnerOrchestrator { /// the 5-min sustained-offline grace (there's no job to orphan: an unconfirmed /// runner is never assigned work), and count it toward the launch-failure /// backoff so a repeated failure surfaces instead of spawning endlessly. + /// + /// VALUES: a NEGATIVE value DISABLES the reaper (the `>= 0` guard at the use + /// site). A very small value (e.g. `0`) reaps a runner before it has any + /// chance to come online — only sensible in tests; the production default + /// leaves a real connect window. Independent of the `listRunners`-outage + /// backstop, which has its own `listRunnersOutageHoldInterval`. private let neverConfirmedReapInterval: TimeInterval + /// How long `listRunners` must be CONTINUOUSLY failing before the converge + /// step holds the fleet instead of scaling up against unknown busy-ness (the + /// outage backstop in `reconcile`). Deliberately SEPARATE from + /// `neverConfirmedReapInterval`: tuning or disabling the zombie reaper must + /// not silently widen or disable this safety window, which would re-enable the + /// runaway scale-up during a prolonged outage. A brief blip still scales up. + private let listRunnersOutageHoldInterval: TimeInterval /// What the last queue poll saw — the UI's window into scale-from-zero /// (zero runners is the normal armed state, so the demand signal itself is @@ -280,7 +293,8 @@ public final class RunnerOrchestrator { idleTrimGraceInterval: TimeInterval = 90, trimConfirmInterval: TimeInterval = 25, launchFailureGraceInterval: TimeInterval = 30, - neverConfirmedReapInterval: TimeInterval = 120 + neverConfirmedReapInterval: TimeInterval = 120, + listRunnersOutageHoldInterval: TimeInterval = 120 ) { self.controlPlane = controlPlane self.factory = factory @@ -296,6 +310,7 @@ public final class RunnerOrchestrator { self.trimConfirmInterval = trimConfirmInterval self.launchFailureGraceInterval = launchFailureGraceInterval self.neverConfirmedReapInterval = neverConfirmedReapInterval + self.listRunnersOutageHoldInterval = listRunnersOutageHoldInterval } public var runners: [ManagedRunner] { @@ -421,7 +436,7 @@ public final class RunnerOrchestrator { // online, which is exactly the snowball that piled up containers. Mirrors // the "failed queue poll HOLDS" rule above. A brief blip still scales up. if let since = listRunnersFailingSince, - Date().timeIntervalSince(since) >= neverConfirmedReapInterval + Date().timeIntervalSince(since) >= listRunnersOutageHoldInterval { lastError = "Can't reach GitHub's runner API — holding the fleet (not spawning more)." notify() diff --git a/Tests/MactionsCoreTests/OrchestratorTests.swift b/Tests/MactionsCoreTests/OrchestratorTests.swift index 0bb4e50..c7ad88f 100644 --- a/Tests/MactionsCoreTests/OrchestratorTests.swift +++ b/Tests/MactionsCoreTests/OrchestratorTests.swift @@ -176,7 +176,8 @@ final class OrchestratorTests: XCTestCase { idleTrimGraceInterval: TimeInterval = 90, trimConfirmInterval: TimeInterval = 0.01, // tests: one 50ms tick elapses it launchFailureGraceInterval: TimeInterval = 0, // tests: disabled unless opted in - neverConfirmedReapInterval: TimeInterval = 3600 // tests: effectively off unless opted in + neverConfirmedReapInterval: TimeInterval = 3600, // tests: effectively off unless opted in + listRunnersOutageHoldInterval: TimeInterval = 3600 // tests: effectively off unless opted in ) -> (RunnerOrchestrator, FakeControlPlane, FakeFactory) { let cp = FakeControlPlane() cp.queuedJobs = queued ?? Array(repeating: labels, count: count) @@ -193,7 +194,8 @@ final class OrchestratorTests: XCTestCase { idleTrimGraceInterval: idleTrimGraceInterval, trimConfirmInterval: trimConfirmInterval, launchFailureGraceInterval: launchFailureGraceInterval, - neverConfirmedReapInterval: neverConfirmedReapInterval) + neverConfirmedReapInterval: neverConfirmedReapInterval, + listRunnersOutageHoldInterval: listRunnersOutageHoldInterval) return (orch, cp, factory) } @@ -877,7 +879,7 @@ final class OrchestratorTests: XCTestCase { let (orch, cp, factory) = makeOrchestrator( count: 3, queued: [["self-hosted"]], // one job → one runner to start reconcileInterval: 50_000_000, - neverConfirmedReapInterval: 0) // the "sustained" window is immediate in the test + listRunnersOutageHoldInterval: 0) // the outage-hold window is immediate in the test await orch.start() let runner = try! XCTUnwrap(orch.runners.first) // Confirm it online + busy so the reaper leaves it alone and a slot persists.