diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..c1a0582 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,40 @@ +# Changelog + +All notable changes to Mactions are documented here. This project adheres to +[Semantic Versioning](https://semver.org/). + +## [0.1.3] - 2026-06-16 + +### Fixed + +- **Linux runner containers could pile up when a runner never reached GitHub.** + If an ephemeral Linux container registered but never connected — e.g. its + network egress failed — it never exited, so the orchestrator dropped it from + tracking and launched a replacement every reap cycle, accumulating zombie + containers (15 observed in one incident) that exhausted Apple `container`'s + shared vmnet NAT bridge and wedged the network for the rest. + + Root cause: `LinuxContainerProvider.stop()` only SIGTERM'd the foreground + `container run` client (an attachment to the daemon-managed container) and + relied on its termination handler to force-delete the container — which did + not fire reliably, so the container kept running. `stop()` now force-deletes + the container by name directly (`container delete --force`, verified to reap a + running container), so every reap actually removes its container. + +### Added + +- **Liveness reaper for "registered but never online" runners.** A runner alive + past `neverConfirmedReapInterval` (default 2 min) that GitHub never once + confirmed `online` is force-deleted, deregistered, and counted toward the + launch-failure backoff — rather than waiting out the 5-minute sustained-offline + grace (which exists to protect a mid-job runner from a transient blip). A + repeated failure now flips `launchFailing` ("substrate broken") instead of + respawning forever. +- **Bridge-safe Linux concurrency ceiling** (`maxConcurrentContainersCeiling`, + default 4). The effective per-host container cap is the lower of the RAM/CPU + divide and this ceiling, because Apple `container`'s single shared vmnet NAT + bridge — not RAM/CPU — is the binding constraint past a handful of containers. +- **Sustained runner-API outage backstop.** When `listRunners` has been failing + for a sustained window, the orchestrator holds the fleet instead of scaling up + against unknown busy-ness (which, during an egress outage, would spawn + replacements that can never connect). diff --git a/Sources/Mactions/AppState.swift b/Sources/Mactions/AppState.swift index b3af9cb..161a685 100644 --- a/Sources/Mactions/AppState.swift +++ b/Sources/Mactions/AppState.swift @@ -777,7 +777,7 @@ final class AppState: ObservableObject { let maxLinux = linuxFactory == nil ? 0 - : LinuxContainerBudget.maxConcurrentContainers( + : LinuxContainerBudget.effectiveMaxConcurrentContainers( physicalMemoryBytes: ProcessInfo.processInfo.physicalMemory, activeProcessorCount: ProcessInfo.processInfo.activeProcessorCount) @@ -1887,7 +1887,7 @@ final class AppState: ObservableObject { ? WindowsVMBudget.maxConcurrentVMs(physicalMemoryBytes: ram, perVMGB: perVM) : 0, linuxMaxConcurrentContainers: linuxImageReady - ? LinuxContainerBudget.maxConcurrentContainers( + ? LinuxContainerBudget.effectiveMaxConcurrentContainers( physicalMemoryBytes: ram, activeProcessorCount: ProcessInfo.processInfo.activeProcessorCount) : 0, diff --git a/Sources/MactionsCore/LinuxContainerBudget.swift b/Sources/MactionsCore/LinuxContainerBudget.swift index 7c61531..d8a3e03 100644 --- a/Sources/MactionsCore/LinuxContainerBudget.swift +++ b/Sources/MactionsCore/LinuxContainerBudget.swift @@ -23,6 +23,15 @@ public enum LinuxContainerBudget { /// is OOM-killed (exit 137) rather than swamping the host. public static let defaultMemoryGBPerContainer = 6 + /// Hard ceiling on concurrent containers, applied BELOW the RAM/CPU divide. + /// Apple `container` shares a single vmnet NAT bridge across every container; + /// past a handful of concurrent lightweight VMs the bridge/DHCP path — not + /// RAM/CPU — becomes the binding constraint, and a registered-but-offline + /// container can wedge it for the others. Clamp explicitly so a large Mac + /// can't authorize a fan-out that exhausts the bridge (the raw divide permitted + /// 8+ on a 64 GB host, which is how a stuck runner snowballed into a pileup). + public static let maxConcurrentContainersCeiling = 4 + /// Max concurrent containers given the host's physical memory + core count. /// /// Reserves `reservedGB` for macOS, the app, and idle agents, then divides the @@ -45,4 +54,24 @@ public enum LinuxContainerBudget { let byCPU = max(0, activeProcessorCount) / cpusPerContainer return max(0, min(byRAM, byCPU)) } + + /// The cap the host budget should actually ENFORCE: the RAM/CPU divide above, + /// then clamped to `maxConcurrentContainersCeiling`. The raw divide can + /// authorize many containers on a big Mac, but Apple `container`'s shared + /// vmnet NAT bridge — not RAM/CPU — binds first, so the effective cap is the + /// lower of the two. Callers computing the live Linux budget use THIS. + public static func effectiveMaxConcurrentContainers( + physicalMemoryBytes: UInt64, + activeProcessorCount: Int, + memoryGBPerContainer: Int = defaultMemoryGBPerContainer, + cpusPerContainer: Int = defaultCPUsPerContainer, + reservedGB: Int = 4 + ) -> Int { + min( + maxConcurrentContainers( + physicalMemoryBytes: physicalMemoryBytes, activeProcessorCount: activeProcessorCount, + memoryGBPerContainer: memoryGBPerContainer, cpusPerContainer: cpusPerContainer, + reservedGB: reservedGB), + maxConcurrentContainersCeiling) + } } diff --git a/Sources/MactionsCore/LinuxContainerProvider.swift b/Sources/MactionsCore/LinuxContainerProvider.swift index 4cba9b4..1a1fac4 100644 --- a/Sources/MactionsCore/LinuxContainerProvider.swift +++ b/Sources/MactionsCore/LinuxContainerProvider.swift @@ -209,11 +209,19 @@ public final class LinuxContainerProvider: RunnerProvider, @unchecked Sendable { public func stop() { lock.lock(); let process = self.process; self.process = nil; lock.unlock() - if let process, process.isRunning { - process.terminate() // terminationHandler runs cleanup() - } else { - cleanup() - } + // Terminate the foreground `container run` client so its Process settles and + // the `onExit` callback fires — but that client is only an ATTACHMENT to a + // daemon-managed container. SIGTERM does NOT reliably make `container run` + // exit, so we must not depend on its terminationHandler to invoke cleanup(). + // Force-delete by NAME ourselves, unconditionally: `container delete --force` + // removes a still-running container, and `cleanup()` is idempotent (the + // `cleaned` guard), so a later terminationHandler is a harmless no-op. + // + // Without this, a reaped runner's container kept running in the daemon while + // the orchestrator dropped it from `slots` and launched a replacement — so a + // never-connecting runner (failed egress) piled up one zombie per reap cycle. + process?.terminate() + cleanup() } /// Idempotent: `--rm` reaps the container on a normal exit, but a SIGKILL diff --git a/Sources/MactionsCore/Orchestrator.swift b/Sources/MactionsCore/Orchestrator.swift index 9dc5e13..45d9c54 100644 --- a/Sources/MactionsCore/Orchestrator.swift +++ b/Sources/MactionsCore/Orchestrator.swift @@ -144,6 +144,15 @@ public final class RunnerOrchestrator { /// don't record a run, don't hot-loop a replacement — the periodic tick /// paces the retry. private let launchFailureGraceInterval: TimeInterval + /// A slot ALIVE this long without GitHub ever confirming it `online` + /// (`confirmedOnline` never set) is a never-connected zombie: it minted a + /// registration but its egress failed, so it never reached GitHub and never + /// claimed a job — and, unlike a launch-FAILURE *exit*, its container keeps + /// running, so it never trips `launchFailureGraceInterval`. Reap it well before + /// the 5-min sustained-offline grace (there's no job to orphan: an unconfirmed + /// runner is never assigned work), and count it toward the launch-failure + /// backoff so a repeated failure surfaces instead of spawning endlessly. + private let neverConfirmedReapInterval: TimeInterval /// What the last queue poll saw — the UI's window into scale-from-zero /// (zero runners is the normal armed state, so the demand signal itself is @@ -169,6 +178,12 @@ public final class RunnerOrchestrator { /// broken (dead daemon, bad base image) and "starting…" would be a lie. private var consecutiveLaunchFailures = 0 public var launchFailing: Bool { consecutiveLaunchFailures >= 2 } + /// When `listRunners` first started failing (cleared on the next success). A + /// SUSTAINED runner-API outage must NOT read as "spawn more": with busy-ness + /// unknown, the converge step would scale up to a fixed target every tick, and + /// when the outage is an egress failure those new runners can never connect — + /// the snowball. Used by the `health == nil` backstop in `reconcile`. + private var listRunnersFailingSince: Date? /// Fired (on the main actor) whenever `state`/`runners` change. public var onChange: (() -> Void)? /// Fired (on the main actor) once for each runner that finishes — a clean @@ -264,7 +279,8 @@ public final class RunnerOrchestrator { idleJITRefreshInterval: TimeInterval? = 8 * 60, idleTrimGraceInterval: TimeInterval = 90, trimConfirmInterval: TimeInterval = 25, - launchFailureGraceInterval: TimeInterval = 30 + launchFailureGraceInterval: TimeInterval = 30, + neverConfirmedReapInterval: TimeInterval = 120 ) { self.controlPlane = controlPlane self.factory = factory @@ -279,6 +295,7 @@ public final class RunnerOrchestrator { self.idleTrimGraceInterval = idleTrimGraceInterval self.trimConfirmInterval = trimConfirmInterval self.launchFailureGraceInterval = launchFailureGraceInterval + self.neverConfirmedReapInterval = neverConfirmedReapInterval } public var runners: [ManagedRunner] { @@ -397,6 +414,19 @@ public final class RunnerOrchestrator { // 3. Converge. guard let health else { + // SUSTAINED runner-API outage → HOLD the fleet (don't spawn more). With + // busy-ness unknown, the scale-up below assumes queued jobs need NEW + // runners — but when the outage is an egress failure (often the same one + // stopping runners from connecting), those replacements can never come + // online, which is exactly the snowball that piled up containers. Mirrors + // the "failed queue poll HOLDS" rule above. A brief blip still scales up. + if let since = listRunnersFailingSince, + Date().timeIntervalSince(since) >= neverConfirmedReapInterval + { + lastError = "Can't reach GitHub's runner API — holding the fleet (not spawning more)." + notify() + return + } // The queue read is real but busy-ness is unknown (listRunners failed). // Assume every current slot is busy: demand that PERSISTS alongside // genuinely idle runners doesn't happen (GitHub assigns within seconds), @@ -538,10 +568,12 @@ public final class RunnerOrchestrator { do { remote = try await controlPlane.listRunners() } catch { + if listRunnersFailingSince == nil { listRunnersFailingSince = Date() } lastError = String(describing: error) notify() return nil } + listRunnersFailingSince = nil // a successful fetch clears the outage clock guard epoch == myEpoch, state == .online else { return nil } let byId = Dictionary(remote.map { ($0.id, $0) }, uniquingKeysWith: { first, _ in first }) @@ -556,6 +588,26 @@ public final class RunnerOrchestrator { let healthy = remoteRunner?.status.lowercased() == "online" if !healthy { + // NEVER-CONNECTED ZOMBIE: appended, alive past `neverConfirmedReapInterval`, + // and GitHub has NEVER once confirmed it online. This is the egress-failed + // container — it registered but never reached GitHub, so it has no job to + // orphan (an unconfirmed runner is never assigned work) and its container + // won't exit on its own (so the launch-failure-on-EXIT path never fires). + // Reap it FAST instead of waiting out the 5-min sustained-offline grace + // (which exists to protect a mid-job runner from a transient blip) — else + // replacements pile up behind it. Count it toward the launch-failure + // backoff so a repeated failure flips `launchFailing` ("substrate broken") + // rather than silently respawning forever. + if slot.appended, !slot.confirmedOnline, neverConfirmedReapInterval >= 0, + age >= neverConfirmedReapInterval + { + consecutiveLaunchFailures += 1 + lastError = + "Runner \(slot.name) registered but never reached GitHub within " + + "\(Int(neverConfirmedReapInterval))s — reaping (check network/egress)." + stale.append((slot, remoteRunner)) + continue + } // Missing from GitHub's list, or not online. Prune ONLY after SUSTAINED // unhealth (≥ grace), never on a single bad reading — a transient // eventual-consistency / connection blip must not tear down a runner diff --git a/Tests/MactionsCoreTests/LinuxContainerBudgetTests.swift b/Tests/MactionsCoreTests/LinuxContainerBudgetTests.swift index 6d60683..ed3668c 100644 --- a/Tests/MactionsCoreTests/LinuxContainerBudgetTests.swift +++ b/Tests/MactionsCoreTests/LinuxContainerBudgetTests.swift @@ -58,4 +58,30 @@ final class LinuxContainerBudgetTests: XCTestCase { XCTAssertEqual(LinuxContainerBudget.defaultCPUsPerContainer, 2) XCTAssertEqual(LinuxContainerBudget.defaultMemoryGBPerContainer, 6) } + + func testEffectiveCapClampsLargeHostToBridgeCeiling() { + // 128 GB / 64 cores: the raw RAM/CPU divide authorizes min((128-4)/6, 64/2) + // = min(20, 32) = 20 — far past what the shared vmnet bridge tolerates. + XCTAssertEqual( + LinuxContainerBudget.maxConcurrentContainers( + physicalMemoryBytes: gb(128), activeProcessorCount: 64), 20) + // The EFFECTIVE cap (what the host budget enforces) clamps to the ceiling. + XCTAssertEqual(LinuxContainerBudget.maxConcurrentContainersCeiling, 4) + XCTAssertEqual( + LinuxContainerBudget.effectiveMaxConcurrentContainers( + physicalMemoryBytes: gb(128), activeProcessorCount: 64), + LinuxContainerBudget.maxConcurrentContainersCeiling) + } + + func testEffectiveCapNeverRaisesASmallHost() { + // 12 GB / 4 cores: raw = min((12-4)/6, 4/2) = min(1, 2) = 1. The ceiling is + // an UPPER clamp only — it must not raise a host that's below it. + XCTAssertEqual( + LinuxContainerBudget.effectiveMaxConcurrentContainers( + physicalMemoryBytes: gb(12), activeProcessorCount: 4), 1) + // A host that can't fit even one container stays 0 through the clamp. + XCTAssertEqual( + LinuxContainerBudget.effectiveMaxConcurrentContainers( + physicalMemoryBytes: gb(4), activeProcessorCount: 8), 0) + } } diff --git a/Tests/MactionsCoreTests/OrchestratorTests.swift b/Tests/MactionsCoreTests/OrchestratorTests.swift index da657d5..0bb4e50 100644 --- a/Tests/MactionsCoreTests/OrchestratorTests.swift +++ b/Tests/MactionsCoreTests/OrchestratorTests.swift @@ -175,7 +175,8 @@ final class OrchestratorTests: XCTestCase { idleJITRefreshInterval: TimeInterval? = 8 * 60, idleTrimGraceInterval: TimeInterval = 90, trimConfirmInterval: TimeInterval = 0.01, // tests: one 50ms tick elapses it - launchFailureGraceInterval: TimeInterval = 0 // tests: disabled unless opted in + launchFailureGraceInterval: TimeInterval = 0, // tests: disabled unless opted in + neverConfirmedReapInterval: TimeInterval = 3600 // tests: effectively off unless opted in ) -> (RunnerOrchestrator, FakeControlPlane, FakeFactory) { let cp = FakeControlPlane() cp.queuedJobs = queued ?? Array(repeating: labels, count: count) @@ -191,7 +192,8 @@ final class OrchestratorTests: XCTestCase { idleJITRefreshInterval: idleJITRefreshInterval, idleTrimGraceInterval: idleTrimGraceInterval, trimConfirmInterval: trimConfirmInterval, - launchFailureGraceInterval: launchFailureGraceInterval) + launchFailureGraceInterval: launchFailureGraceInterval, + neverConfirmedReapInterval: neverConfirmedReapInterval) return (orch, cp, factory) } @@ -830,6 +832,73 @@ final class OrchestratorTests: XCTestCase { await orch.stop() } + /// A container that REGISTERS but never reaches GitHub (its egress failed) is a + /// zombie: it won't exit, so the launch-failure-on-exit path never fires, and + /// waiting out the 5-min sustained-offline grace lets replacements pile up + /// behind it (the observed 15-container incident). With a short + /// `neverConfirmedReapInterval` it's reaped FAST — `provider.stop()` runs (so + /// the real provider force-deletes the container), the never-online + /// registration is deregistered, and it's counted toward the launch-failure + /// backoff so a repeated failure surfaces instead of respawning forever. + func testReapsRegisteredButNeverConfirmedOnlineZombieFast() async { + let (orch, cp, factory) = makeOrchestrator( + count: 1, reconcileInterval: 50_000_000, + remoteRegistrationGraceInterval: 5 * 60, // sustained-offline grace stays LONG… + idleJITRefreshInterval: nil, + neverConfirmedReapInterval: 0) // …but the never-online reaper fires immediately + await orch.start() + let runner = try! XCTUnwrap(orch.runners.first) + + // Registered (has a remoteId) but GitHub reports it offline and it was NEVER + // online — the egress-failed container. The 5-min grace hasn't elapsed, so + // ONLY the never-confirmed reaper can catch this. + cp.remote = [ + RemoteRunner(id: runner.remoteId!, name: runner.id, status: "offline", busy: false) + ] + let reaped = await waitUntil { + factory.made[0].stopped && cp.deleted.contains(runner.remoteId!) + && orch.runners.first?.id != runner.id + } + XCTAssertTrue( + reaped, + "a never-online zombie must be stopped (→ container force-deleted), deregistered, and replaced — fast") + // Repeated never-online reaps must surface as `launchFailing` ("substrate + // broken") rather than silently respawning forever. + let surfaced = await waitUntil { orch.launchFailing } + XCTAssertTrue(surfaced, "two never-online reaps must flip launchFailing") + await orch.stop() + } + + /// A SUSTAINED `listRunners` outage must HOLD the fleet, not keep scaling up. + /// With busy-ness unknown the converge step assumes queued jobs need NEW + /// runners — but when the outage is an egress failure those replacements can + /// never connect, which is the snowball. (A brief blip still scales up.) + func testSustainedListRunnersOutageHoldsTheFleet() async { + let (orch, cp, factory) = makeOrchestrator( + count: 3, queued: [["self-hosted"]], // one job → one runner to start + reconcileInterval: 50_000_000, + neverConfirmedReapInterval: 0) // the "sustained" window is immediate in the test + await orch.start() + let runner = try! XCTUnwrap(orch.runners.first) + // Confirm it online + busy so the reaper leaves it alone and a slot persists. + cp.remote = [ + RemoteRunner(id: runner.remoteId!, name: runner.id, status: "online", busy: true) + ] + let started = await waitUntil { orch.runners.count == 1 } + XCTAssertTrue(started) + + // The runner API goes dark and STAYS dark, while demand spikes to 3. The + // fleet must NOT grow toward demand against unknown busy-ness. + cp.failListRunners = true + cp.queuedJobs = Array(repeating: ["self-hosted"], count: 3) + try? await Task.sleep(nanoseconds: 400_000_000) // ~8 ticks; a negative assertion + XCTAssertEqual( + orch.runners.count, 1, + "a sustained runner-API outage must hold the fleet, not scale up to demand") + _ = factory + await orch.stop() + } + func testReconcileRefreshesIdleOnlineRunnerBeforeJITExpires() async { let (orch, cp, factory) = makeOrchestrator( count: 1, reconcileInterval: 100_000_000, remoteRegistrationGraceInterval: 0, diff --git a/project.yml b/project.yml index 5092677..f47d371 100644 --- a/project.yml +++ b/project.yml @@ -91,7 +91,7 @@ targets: # The Icon Composer icon → real AppIcon (compiled by actool). ASSETCATALOG_COMPILER_APPICON_NAME: Mactions SWIFT_VERSION: "6.0" - MARKETING_VERSION: "0.1.2" + MARKETING_VERSION: "0.1.3" CURRENT_PROJECT_VERSION: "1" GENERATE_INFOPLIST_FILE: NO INFOPLIST_FILE: xcode/Info.plist