Kyter-com · NickReisenauer · Jun 17, 2026 · Jun 17, 2026 · NickReisenauer · Jun 17, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,40 @@
+# Changelog
+
+All notable changes to Mactions are documented here. This project adheres to
+[Semantic Versioning](https://semver.org/).
+
+## [0.1.3] - 2026-06-16
+
+### Fixed
+
+- **Linux runner containers could pile up when a runner never reached GitHub.**
+  If an ephemeral Linux container registered but never connected — e.g. its
+  network egress failed — it never exited, so the orchestrator dropped it from
+  tracking and launched a replacement every reap cycle, accumulating zombie
+  containers (15 observed in one incident) that exhausted Apple `container`'s
+  shared vmnet NAT bridge and wedged the network for the rest.
+
+  Root cause: `LinuxContainerProvider.stop()` only SIGTERM'd the foreground
+  `container run` client (an attachment to the daemon-managed container) and
+  relied on its termination handler to force-delete the container — which did
+  not fire reliably, so the container kept running. `stop()` now force-deletes
+  the container by name directly (`container delete --force`, verified to reap a
+  running container), so every reap actually removes its container.
+
+### Added
+
+- **Liveness reaper for "registered but never online" runners.** A runner alive
+  past `neverConfirmedReapInterval` (default 2 min) that GitHub never once
+  confirmed `online` is force-deleted, deregistered, and counted toward the
+  launch-failure backoff — rather than waiting out the 5-minute sustained-offline
+  grace (which exists to protect a mid-job runner from a transient blip). A
+  repeated failure now flips `launchFailing` ("substrate broken") instead of
+  respawning forever.
+- **Bridge-safe Linux concurrency ceiling** (`maxConcurrentContainersCeiling`,
+  default 4). The effective per-host container cap is the lower of the RAM/CPU
+  divide and this ceiling, because Apple `container`'s single shared vmnet NAT
+  bridge — not RAM/CPU — is the binding constraint past a handful of containers.
+- **Sustained runner-API outage backstop.** When `listRunners` has been failing
+  for a sustained window, the orchestrator holds the fleet instead of scaling up
+  against unknown busy-ness (which, during an egress outage, would spawn
+  replacements that can never connect).
diff --git a/Sources/Mactions/AppState.swift b/Sources/Mactions/AppState.swift
@@ -777,7 +777,7 @@ final class AppState: ObservableObject {
         let maxLinux =
           linuxFactory == nil
           ? 0
-          : LinuxContainerBudget.maxConcurrentContainers(
+          : LinuxContainerBudget.effectiveMaxConcurrentContainers(
             physicalMemoryBytes: ProcessInfo.processInfo.physicalMemory,
             activeProcessorCount: ProcessInfo.processInfo.activeProcessorCount)
 
@@ -1887,7 +1887,7 @@ final class AppState: ObservableObject {
         ? WindowsVMBudget.maxConcurrentVMs(physicalMemoryBytes: ram, perVMGB: perVM)
         : 0,
       linuxMaxConcurrentContainers: linuxImageReady
-        ? LinuxContainerBudget.maxConcurrentContainers(
+        ? LinuxContainerBudget.effectiveMaxConcurrentContainers(
           physicalMemoryBytes: ram,
           activeProcessorCount: ProcessInfo.processInfo.activeProcessorCount)
         : 0,

diff --git a/Sources/MactionsCore/LinuxContainerBudget.swift b/Sources/MactionsCore/LinuxContainerBudget.swift
@@ -23,6 +23,15 @@ public enum LinuxContainerBudget {
   /// is OOM-killed (exit 137) rather than swamping the host.
   public static let defaultMemoryGBPerContainer = 6
 
+  /// Hard ceiling on concurrent containers, applied BELOW the RAM/CPU divide.
+  /// Apple `container` shares a single vmnet NAT bridge across every container;
+  /// past a handful of concurrent lightweight VMs the bridge/DHCP path — not
+  /// RAM/CPU — becomes the binding constraint, and a registered-but-offline
+  /// container can wedge it for the others. Clamp explicitly so a large Mac
+  /// can't authorize a fan-out that exhausts the bridge (the raw divide permitted
+  /// 8+ on a 64 GB host, which is how a stuck runner snowballed into a pileup).
+  public static let maxConcurrentContainersCeiling = 4
+
   /// Max concurrent containers given the host's physical memory + core count.
   ///
   /// Reserves `reservedGB` for macOS, the app, and idle agents, then divides the
@@ -45,4 +54,24 @@ public enum LinuxContainerBudget {
     let byCPU = max(0, activeProcessorCount) / cpusPerContainer
     return max(0, min(byRAM, byCPU))
   }
+
+  /// The cap the host budget should actually ENFORCE: the RAM/CPU divide above,
+  /// then clamped to `maxConcurrentContainersCeiling`. The raw divide can
+  /// authorize many containers on a big Mac, but Apple `container`'s shared
+  /// vmnet NAT bridge — not RAM/CPU — binds first, so the effective cap is the
+  /// lower of the two. Callers computing the live Linux budget use THIS.
+  public static func effectiveMaxConcurrentContainers(
+    physicalMemoryBytes: UInt64,
+    activeProcessorCount: Int,
+    memoryGBPerContainer: Int = defaultMemoryGBPerContainer,
+    cpusPerContainer: Int = defaultCPUsPerContainer,
+    reservedGB: Int = 4
+  ) -> Int {
+    min(
+      maxConcurrentContainers(
+        physicalMemoryBytes: physicalMemoryBytes, activeProcessorCount: activeProcessorCount,
+        memoryGBPerContainer: memoryGBPerContainer, cpusPerContainer: cpusPerContainer,
+        reservedGB: reservedGB),
+      maxConcurrentContainersCeiling)
+  }
 }
diff --git a/Sources/MactionsCore/LinuxContainerProvider.swift b/Sources/MactionsCore/LinuxContainerProvider.swift
@@ -209,11 +209,19 @@ public final class LinuxContainerProvider: RunnerProvider, @unchecked Sendable {
 
   public func stop() {
     lock.lock(); let process = self.process; self.process = nil; lock.unlock()
-    if let process, process.isRunning {
-      process.terminate()  // terminationHandler runs cleanup()
-    } else {
-      cleanup()
-    }
+    // Terminate the foreground `container run` client so its Process settles and
+    // the `onExit` callback fires — but that client is only an ATTACHMENT to a
+    // daemon-managed container. SIGTERM does NOT reliably make `container run`
+    // exit, so we must not depend on its terminationHandler to invoke cleanup().
+    // Force-delete by NAME ourselves, unconditionally: `container delete --force`
+    // removes a still-running container, and `cleanup()` is idempotent (the
+    // `cleaned` guard), so a later terminationHandler is a harmless no-op.
+    //
+    // Without this, a reaped runner's container kept running in the daemon while
+    // the orchestrator dropped it from `slots` and launched a replacement — so a
+    // never-connecting runner (failed egress) piled up one zombie per reap cycle.
+    process?.terminate()
+    cleanup()
   }
 
   /// Idempotent: `--rm` reaps the container on a normal exit, but a SIGKILL

diff --git a/Sources/MactionsCore/Orchestrator.swift b/Sources/MactionsCore/Orchestrator.swift
@@ -144,6 +144,15 @@ public final class RunnerOrchestrator {
   /// don't record a run, don't hot-loop a replacement — the periodic tick
   /// paces the retry.
   private let launchFailureGraceInterval: TimeInterval
+  /// A slot ALIVE this long without GitHub ever confirming it `online`
+  /// (`confirmedOnline` never set) is a never-connected zombie: it minted a
+  /// registration but its egress failed, so it never reached GitHub and never
+  /// claimed a job — and, unlike a launch-FAILURE *exit*, its container keeps
+  /// running, so it never trips `launchFailureGraceInterval`. Reap it well before
+  /// the 5-min sustained-offline grace (there's no job to orphan: an unconfirmed
+  /// runner is never assigned work), and count it toward the launch-failure
+  /// backoff so a repeated failure surfaces instead of spawning endlessly.
+  private let neverConfirmedReapInterval: TimeInterval
 
   /// What the last queue poll saw — the UI's window into scale-from-zero
   /// (zero runners is the normal armed state, so the demand signal itself is
@@ -169,6 +178,12 @@ public final class RunnerOrchestrator {
   /// broken (dead daemon, bad base image) and "starting…" would be a lie.
   private var consecutiveLaunchFailures = 0
   public var launchFailing: Bool { consecutiveLaunchFailures >= 2 }
+  /// When `listRunners` first started failing (cleared on the next success). A
+  /// SUSTAINED runner-API outage must NOT read as "spawn more": with busy-ness
+  /// unknown, the converge step would scale up to a fixed target every tick, and
+  /// when the outage is an egress failure those new runners can never connect —
+  /// the snowball. Used by the `health == nil` backstop in `reconcile`.
+  private var listRunnersFailingSince: Date?
   /// Fired (on the main actor) whenever `state`/`runners` change.
   public var onChange: (() -> Void)?
   /// Fired (on the main actor) once for each runner that finishes — a clean
@@ -264,7 +279,8 @@ public final class RunnerOrchestrator {
     idleJITRefreshInterval: TimeInterval? = 8 * 60,
     idleTrimGraceInterval: TimeInterval = 90,
     trimConfirmInterval: TimeInterval = 25,
-    launchFailureGraceInterval: TimeInterval = 30
+    launchFailureGraceInterval: TimeInterval = 30,
+    neverConfirmedReapInterval: TimeInterval = 120
   ) {
     self.controlPlane = controlPlane
     self.factory = factory
@@ -279,6 +295,7 @@ public final class RunnerOrchestrator {
     self.idleTrimGraceInterval = idleTrimGraceInterval
     self.trimConfirmInterval = trimConfirmInterval
     self.launchFailureGraceInterval = launchFailureGraceInterval
+    self.neverConfirmedReapInterval = neverConfirmedReapInterval
   }
 
   public var runners: [ManagedRunner] {
@@ -397,6 +414,19 @@ public final class RunnerOrchestrator {
 
     // 3. Converge.
     guard let health else {
+      // SUSTAINED runner-API outage → HOLD the fleet (don't spawn more). With
+      // busy-ness unknown, the scale-up below assumes queued jobs need NEW
+      // runners — but when the outage is an egress failure (often the same one
+      // stopping runners from connecting), those replacements can never come
+      // online, which is exactly the snowball that piled up containers. Mirrors
+      // the "failed queue poll HOLDS" rule above. A brief blip still scales up.
+      if let since = listRunnersFailingSince,
+        Date().timeIntervalSince(since) >= neverConfirmedReapInterval
+      {
+        lastError = "Can't reach GitHub's runner API — holding the fleet (not spawning more)."
+        notify()
+        return
+      }
       // The queue read is real but busy-ness is unknown (listRunners failed).
       // Assume every current slot is busy: demand that PERSISTS alongside
       // genuinely idle runners doesn't happen (GitHub assigns within seconds),
@@ -538,10 +568,12 @@ public final class RunnerOrchestrator {
     do {
       remote = try await controlPlane.listRunners()
     } catch {
+      if listRunnersFailingSince == nil { listRunnersFailingSince = Date() }
       lastError = String(describing: error)
       notify()
       return nil
     }
+    listRunnersFailingSince = nil  // a successful fetch clears the outage clock
     guard epoch == myEpoch, state == .online else { return nil }
 
     let byId = Dictionary(remote.map { ($0.id, $0) }, uniquingKeysWith: { first, _ in first })
@@ -556,6 +588,26 @@ public final class RunnerOrchestrator {
       let healthy = remoteRunner?.status.lowercased() == "online"
 
       if !healthy {
+        // NEVER-CONNECTED ZOMBIE: appended, alive past `neverConfirmedReapInterval`,
+        // and GitHub has NEVER once confirmed it online. This is the egress-failed
+        // container — it registered but never reached GitHub, so it has no job to
+        // orphan (an unconfirmed runner is never assigned work) and its container
+        // won't exit on its own (so the launch-failure-on-EXIT path never fires).
+        // Reap it FAST instead of waiting out the 5-min sustained-offline grace
+        // (which exists to protect a mid-job runner from a transient blip) — else
+        // replacements pile up behind it. Count it toward the launch-failure
+        // backoff so a repeated failure flips `launchFailing` ("substrate broken")
+        // rather than silently respawning forever.
+        if slot.appended, !slot.confirmedOnline, neverConfirmedReapInterval >= 0,
+          age >= neverConfirmedReapInterval
+        {
+          consecutiveLaunchFailures += 1
+          lastError =
+            "Runner \(slot.name) registered but never reached GitHub within "
+            + "\(Int(neverConfirmedReapInterval))s — reaping (check network/egress)."
+          stale.append((slot, remoteRunner))
+          continue
+        }
         // Missing from GitHub's list, or not online. Prune ONLY after SUSTAINED
         // unhealth (≥ grace), never on a single bad reading — a transient
         // eventual-consistency / connection blip must not tear down a runner

diff --git a/Tests/MactionsCoreTests/LinuxContainerBudgetTests.swift b/Tests/MactionsCoreTests/LinuxContainerBudgetTests.swift
@@ -58,4 +58,30 @@ final class LinuxContainerBudgetTests: XCTestCase {
     XCTAssertEqual(LinuxContainerBudget.defaultCPUsPerContainer, 2)
     XCTAssertEqual(LinuxContainerBudget.defaultMemoryGBPerContainer, 6)
   }
+
+  func testEffectiveCapClampsLargeHostToBridgeCeiling() {
+    // 128 GB / 64 cores: the raw RAM/CPU divide authorizes min((128-4)/6, 64/2)
+    // = min(20, 32) = 20 — far past what the shared vmnet bridge tolerates.
+    XCTAssertEqual(
+      LinuxContainerBudget.maxConcurrentContainers(
+        physicalMemoryBytes: gb(128), activeProcessorCount: 64), 20)
+    // The EFFECTIVE cap (what the host budget enforces) clamps to the ceiling.
+    XCTAssertEqual(LinuxContainerBudget.maxConcurrentContainersCeiling, 4)
+    XCTAssertEqual(
+      LinuxContainerBudget.effectiveMaxConcurrentContainers(
+        physicalMemoryBytes: gb(128), activeProcessorCount: 64),
+      LinuxContainerBudget.maxConcurrentContainersCeiling)
+  }
+
+  func testEffectiveCapNeverRaisesASmallHost() {
+    // 12 GB / 4 cores: raw = min((12-4)/6, 4/2) = min(1, 2) = 1. The ceiling is
+    // an UPPER clamp only — it must not raise a host that's below it.
+    XCTAssertEqual(
+      LinuxContainerBudget.effectiveMaxConcurrentContainers(
+        physicalMemoryBytes: gb(12), activeProcessorCount: 4), 1)
+    // A host that can't fit even one container stays 0 through the clamp.
+    XCTAssertEqual(
+      LinuxContainerBudget.effectiveMaxConcurrentContainers(
+        physicalMemoryBytes: gb(4), activeProcessorCount: 8), 0)
+  }
 }
diff --git a/Tests/MactionsCoreTests/OrchestratorTests.swift b/Tests/MactionsCoreTests/OrchestratorTests.swift
@@ -175,7 +175,8 @@ final class OrchestratorTests: XCTestCase {
     idleJITRefreshInterval: TimeInterval? = 8 * 60,
     idleTrimGraceInterval: TimeInterval = 90,
     trimConfirmInterval: TimeInterval = 0.01,  // tests: one 50ms tick elapses it
-    launchFailureGraceInterval: TimeInterval = 0  // tests: disabled unless opted in
+    launchFailureGraceInterval: TimeInterval = 0,  // tests: disabled unless opted in
+    neverConfirmedReapInterval: TimeInterval = 3600  // tests: effectively off unless opted in
   ) -> (RunnerOrchestrator, FakeControlPlane, FakeFactory) {
     let cp = FakeControlPlane()
     cp.queuedJobs = queued ?? Array(repeating: labels, count: count)
@@ -191,7 +192,8 @@ final class OrchestratorTests: XCTestCase {
       idleJITRefreshInterval: idleJITRefreshInterval,
       idleTrimGraceInterval: idleTrimGraceInterval,
       trimConfirmInterval: trimConfirmInterval,
-      launchFailureGraceInterval: launchFailureGraceInterval)
+      launchFailureGraceInterval: launchFailureGraceInterval,
+      neverConfirmedReapInterval: neverConfirmedReapInterval)
     return (orch, cp, factory)
   }
 
@@ -830,6 +832,73 @@ final class OrchestratorTests: XCTestCase {
     await orch.stop()
   }
 
+  /// A container that REGISTERS but never reaches GitHub (its egress failed) is a
+  /// zombie: it won't exit, so the launch-failure-on-exit path never fires, and
+  /// waiting out the 5-min sustained-offline grace lets replacements pile up
+  /// behind it (the observed 15-container incident). With a short
+  /// `neverConfirmedReapInterval` it's reaped FAST — `provider.stop()` runs (so
+  /// the real provider force-deletes the container), the never-online
+  /// registration is deregistered, and it's counted toward the launch-failure
+  /// backoff so a repeated failure surfaces instead of respawning forever.
+  func testReapsRegisteredButNeverConfirmedOnlineZombieFast() async {
+    let (orch, cp, factory) = makeOrchestrator(
+      count: 1, reconcileInterval: 50_000_000,
+      remoteRegistrationGraceInterval: 5 * 60,  // sustained-offline grace stays LONG…
+      idleJITRefreshInterval: nil,
+      neverConfirmedReapInterval: 0)  // …but the never-online reaper fires immediately
+    await orch.start()
+    let runner = try! XCTUnwrap(orch.runners.first)
+
+    // Registered (has a remoteId) but GitHub reports it offline and it was NEVER
+    // online — the egress-failed container. The 5-min grace hasn't elapsed, so
+    // ONLY the never-confirmed reaper can catch this.
+    cp.remote = [
+      RemoteRunner(id: runner.remoteId!, name: runner.id, status: "offline", busy: false)
+    ]
+    let reaped = await waitUntil {
+      factory.made[0].stopped && cp.deleted.contains(runner.remoteId!)
+        && orch.runners.first?.id != runner.id
+    }
+    XCTAssertTrue(
+      reaped,
+      "a never-online zombie must be stopped (→ container force-deleted), deregistered, and replaced — fast")
+    // Repeated never-online reaps must surface as `launchFailing` ("substrate
+    // broken") rather than silently respawning forever.
+    let surfaced = await waitUntil { orch.launchFailing }
+    XCTAssertTrue(surfaced, "two never-online reaps must flip launchFailing")
+    await orch.stop()
+  }
+
+  /// A SUSTAINED `listRunners` outage must HOLD the fleet, not keep scaling up.
+  /// With busy-ness unknown the converge step assumes queued jobs need NEW
+  /// runners — but when the outage is an egress failure those replacements can
+  /// never connect, which is the snowball. (A brief blip still scales up.)
+  func testSustainedListRunnersOutageHoldsTheFleet() async {
+    let (orch, cp, factory) = makeOrchestrator(
+      count: 3, queued: [["self-hosted"]],  // one job → one runner to start
+      reconcileInterval: 50_000_000,
+      neverConfirmedReapInterval: 0)  // the "sustained" window is immediate in the test
+    await orch.start()
+    let runner = try! XCTUnwrap(orch.runners.first)
+    // Confirm it online + busy so the reaper leaves it alone and a slot persists.
+    cp.remote = [
+      RemoteRunner(id: runner.remoteId!, name: runner.id, status: "online", busy: true)
+    ]
+    let started = await waitUntil { orch.runners.count == 1 }
+    XCTAssertTrue(started)
+
+    // The runner API goes dark and STAYS dark, while demand spikes to 3. The
+    // fleet must NOT grow toward demand against unknown busy-ness.
+    cp.failListRunners = true
+    cp.queuedJobs = Array(repeating: ["self-hosted"], count: 3)
+    try? await Task.sleep(nanoseconds: 400_000_000)  // ~8 ticks; a negative assertion
+    XCTAssertEqual(
+      orch.runners.count, 1,
+      "a sustained runner-API outage must hold the fleet, not scale up to demand")
+    _ = factory
+    await orch.stop()
+  }
+
   func testReconcileRefreshesIdleOnlineRunnerBeforeJITExpires() async {
     let (orch, cp, factory) = makeOrchestrator(
       count: 1, reconcileInterval: 100_000_000, remoteRegistrationGraceInterval: 0,

diff --git a/project.yml b/project.yml
@@ -91,7 +91,7 @@ targets:
         # The Icon Composer icon → real AppIcon (compiled by actool).
         ASSETCATALOG_COMPILER_APPICON_NAME: Mactions
         SWIFT_VERSION: "6.0"
-        MARKETING_VERSION: "0.1.2"
+        MARKETING_VERSION: "0.1.3"
         CURRENT_PROJECT_VERSION: "1"
         GENERATE_INFOPLIST_FILE: NO
         INFOPLIST_FILE: xcode/Info.plist