From 116a3cc6c149036c4121364f805a202c8d37a83f Mon Sep 17 00:00:00 2001 From: Christian Wendler Date: Tue, 9 Jun 2026 00:37:09 +0200 Subject: [PATCH] fix: orphan-gate the pre-GUI mcp-stdio sweep so cold-starts don't kill the bootstrapper (v0.8.2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Cowork / Claude Code spawned aiui --mcp-stdio with no GUI running, that child cold-started the GUI, and the freshly launched GUI then ran kill_mcp_stdio_started_before_self (v0.4.43) and SIGTERM'd its own bootstrapper — the trace even named the killer: housekeeping: killing pre-GUI mcp-stdio child pid=2483 … startup: killed 1 pre-GUI mcp-stdio child(ren); Claude Desktop will respawn them against the current binary Claude Desktop respawn was the old assumption; Cowork / Claude Code don't respawn, so the bootstrapper just died → user saw "MCP aiui: Server disconnected" on the first MCP call after every cold start. Same fix pattern as v0.4.46's Bug A (kill_orphaned_mcp_stdio_children): add is_orphaned_child as a filter — the pre-GUI sweep still reaps older mcp-stdio children, but only when their parent wrapper is also gone (genuine leak). The live bootstrapper has a live parent, so it is now spared. Regression tests cover both cases: - pre_gui_kill_spares_bootstrapper_with_live_parent - pre_gui_kill_reaps_orphaned_older_child cargo test 131/0, clippy clean (-D warnings), svelte-check 0 errors, vite build ok. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 25 +++++++ companion/src-tauri/Cargo.lock | 2 +- companion/src-tauri/Cargo.toml | 2 +- companion/src-tauri/src/housekeeping.rs | 87 ++++++++++++++++++++----- companion/src-tauri/src/lib.rs | 10 ++- companion/src-tauri/tauri.conf.json | 2 +- python/pyproject.toml | 2 +- 7 files changed, 108 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d11d604..e8bb543 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,31 @@ All notable changes to this project are documented here. +## [0.8.2] — 2026-06-08 + +Targeted Cowork cold-start fix. (CHANGELOG entries for v0.5.0 – v0.8.1 +live with the release notes on GitHub for those tags; the file resumes +here.) + +### Fixed + +- **Cowork: "MCP aiui Server disconnected" on every cold start.** When + Cowork (or any Claude Code client) spawned a fresh `aiui --mcp-stdio` + and no GUI was already running, that child cold-started the GUI — + and the freshly-launched GUI then ran `kill_mcp_stdio_started_before_self` + (v0.4.43) and SIGTERM'd its own bootstrapper. The sweep carried + the assumption "Claude Desktop will respawn them" right in its trace + message; Cowork / Claude Code don't respawn, so the user's MCP + connection died on the first call of every cold start. (2026-06-08 + trace: `housekeeping: killing pre-GUI mcp-stdio child pid=2483 … + cutoff=…`, 138 ms after the bootstrapper attached.) The pre-GUI sweep + is now **orphan-gated** the same way `kill_orphaned_mcp_stdio_children` + (v0.4.46, Bug A) handled the sibling-kill: it still reaps mcp-stdio + children older than the new GUI, but only when their parent wrapper + is already gone (`ppid==1` / absent from the snapshot). A live + bootstrapper has a live parent → spared. Regression tests cover both + the bootstrapper-spared case and the orphan-reaped case. + ## [0.4.46] — 2026-05-29 Dialog-lifecycle hardening. 
Two field-reported regressions from the diff --git a/companion/src-tauri/Cargo.lock b/companion/src-tauri/Cargo.lock index aab30fd..853d761 100644 --- a/companion/src-tauri/Cargo.lock +++ b/companion/src-tauri/Cargo.lock @@ -30,7 +30,7 @@ dependencies = [ [[package]] name = "aiui" -version = "0.8.1" +version = "0.8.2" dependencies = [ "axum", "base64 0.22.1", diff --git a/companion/src-tauri/Cargo.toml b/companion/src-tauri/Cargo.toml index 4edc43b..0d47c28 100644 --- a/companion/src-tauri/Cargo.toml +++ b/companion/src-tauri/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aiui" -version = "0.8.1" +version = "0.8.2" description = "aiui companion — renders dialogs for remote Claude Code sessions" authors = ["byte5"] license = "" diff --git a/companion/src-tauri/src/housekeeping.rs b/companion/src-tauri/src/housekeeping.rs index bafbf4f..24fe240 100644 --- a/companion/src-tauri/src/housekeeping.rs +++ b/companion/src-tauri/src/housekeeping.rs @@ -306,19 +306,29 @@ pub fn kill_orphaned_mcp_stdio_children() -> usize { n } -/// Filter: every `aiui --mcp-stdio` child started *strictly before* -/// `own_start_time`, excluding `own_pid`. Pure function over a +/// Filter: every *orphaned* `aiui --mcp-stdio` child started strictly +/// before `own_start_time`, excluding `own_pid`. Pure function over a /// snapshot — caller passes own pid + own start_time so tests don't /// need to spoof `std::process::id`. /// -/// Used by the GUI at startup right after it wins the process-lifetime -/// lock: any mcp-stdio that predates the freshly-started GUI carries -/// the binary it was spawned with (potentially pre-update RAM), and -/// `disk_version_if_stale` would only catch the version-drift case on -/// its own. The newer GUI is the source of truth → all older children -/// are kicked, Claude Desktop respawns them against the current binary -/// with all of the current GUI's protections (sibling-kill, periodic -/// stale-check, etc.). v0.4.43. 
+/// Called by the GUI at startup right after it wins the +/// process-lifetime lock. Original intent (v0.4.43): clear out +/// mcp-stdio children left over from a *previous* GUI generation — +/// they carry the pre-update binary in RAM, and `disk_version_if_stale` +/// alone wouldn't reach them. +/// +/// Rescoped (v0.8.2): require the child to also be **orphaned** +/// (`is_orphaned_child` — parent process gone). The previous "older +/// than the GUI" rule alone tore down the mcp-stdio child that had +/// just bootstrapped the very GUI which then ran the sweep — Cowork's +/// 2026-06-08 "Server disconnected, then `housekeeping: killing +/// pre-GUI mcp-stdio child pid=2483 … cutoff=1780932439`" trace. The +/// old code's docstring assumed "Claude Desktop respawns them against +/// the current binary"; Claude Code / Cowork **do not** respawn, so +/// the bootstrapper just died and the user saw an MCP error every +/// cold start. Orphan-status is the precise discriminator: a stale +/// leftover from a dead client has lost its parent; a live +/// bootstrapper has not. fn find_pre_gui_mcp_stdio_to_kill( snap: &[ProcSnap], own_pid: u32, @@ -329,6 +339,7 @@ fn find_pre_gui_mcp_stdio_to_kill( .filter(|p| has_mcp_stdio_flag(&p.args)) .filter(|p| is_aiui_binary(&p.exe)) .filter(|p| p.start_time < own_start_time) + .filter(|p| is_orphaned_child(snap, p)) .map(|p| StaleChild { pid: p.pid, exe: p.exe.clone(), @@ -336,10 +347,14 @@ fn find_pre_gui_mcp_stdio_to_kill( .collect() } -/// Public entry: terminate every `aiui --mcp-stdio` child older than -/// us. Returns the count of children signalled. Safe to call from -/// the GUI startup path after winning the process-lifetime lock — -/// no race because at most one GUI holds the lock. +/// Public entry: terminate every *orphaned* `aiui --mcp-stdio` child +/// older than us. Returns the count of children signalled. 
Safe to +/// call from the GUI startup path after winning the process-lifetime +/// lock — no race because at most one GUI holds the lock. +/// +/// Rescoped in v0.8.2: requires orphan-status (parent gone) so we no +/// longer take down the live bootstrapper child that just spawned us. +/// See `find_pre_gui_mcp_stdio_to_kill` for the rationale. pub fn kill_mcp_stdio_started_before_self() -> usize { let own_pid = std::process::id(); let snap = snapshot_processes(); @@ -361,14 +376,15 @@ pub fn kill_mcp_stdio_started_before_self() -> usize { let victims = find_pre_gui_mcp_stdio_to_kill(&snap, own_pid, own_start_time); for victim in &victims { trace(&format!( - "housekeeping: killing pre-GUI mcp-stdio child pid={} exe={} (cutoff={})", + "housekeeping: killing pre-GUI orphan mcp-stdio pid={} exe={} \ + (older than GUI cutoff={} AND parent gone)", victim.pid, victim.exe, own_start_time )); terminate_pid(victim.pid); } if !victims.is_empty() { trace(&format!( - "housekeeping: terminated {} pre-GUI mcp-stdio child(ren) at startup", + "housekeeping: terminated {} pre-GUI orphan mcp-stdio child(ren) at startup", victims.len() )); } @@ -944,6 +960,45 @@ mod tests { ); } + #[test] + fn pre_gui_kill_spares_bootstrapper_with_live_parent() { + // THE 2026-06-08 regression test. A fresh Cowork session + // spawned mcp-stdio #300, which then cold-started GUI #400. + // The GUI's pre-GUI sweep would see #300 as "older than me" + // and SIGTERM it — taking down Cowork's still-live MCP + // connection. Orphan-gate fix: #300's parent wrapper #200 + // is alive in the snap → #300 is a live bootstrapper, not a + // leak → spared. 
+ let snap = vec![ + snap_full(100, 1, "/Applications/Claude.app/Contents/MacOS/Claude", &["Claude"], 500), + snap_full(200, 100, "/Applications/Claude.app/Contents/Helpers/disclaimer", &["disclaimer", CURRENT, "--mcp-stdio"], 999), + // bootstrapper mcp-stdio: older than the GUI, parent (200) alive + snap_full(300, 200, CURRENT, &[CURRENT, "--mcp-stdio"], 1000), + // us — the freshly started GUI: + snap_full(400, 1, "/Applications/aiui.app/Contents/MacOS/aiui", &["aiui"], 2000), + ]; + let victims = find_pre_gui_mcp_stdio_to_kill(&snap, 400, 2000); + assert!( + victims.is_empty(), + "bootstrapper child with live parent must be spared (Bug 2026-06-08)" + ); + } + + #[test] + fn pre_gui_kill_reaps_orphaned_older_child() { + // The legit case the sweep was added for (v0.4.43): an + // mcp-stdio left over from a previous, dead Cowork session — + // parent wrapper gone, child reparented to launchd. Older + // than us AND orphaned → must still be reaped. + let snap = vec![ + snap_full(200, 1, CURRENT, &[CURRENT, "--mcp-stdio"], 1000), + snap_full(400, 1, "/Applications/aiui.app/Contents/MacOS/aiui", &["aiui"], 2000), + ]; + let victims = find_pre_gui_mcp_stdio_to_kill(&snap, 400, 2000); + assert_eq!(victims.len(), 1); + assert_eq!(victims[0].pid, 200); + } + // ---------- ProcessLock ---------- #[test] diff --git a/companion/src-tauri/src/lib.rs b/companion/src-tauri/src/lib.rs index 7d1d3ce..aae2677 100644 --- a/companion/src-tauri/src/lib.rs +++ b/companion/src-tauri/src/lib.rs @@ -1300,11 +1300,17 @@ pub fn run() { // generation and may carry stale in-RAM code (the 2026-05-23 // 0.4.40-children-survive-update scenario). New GUI = new truth. // Race-safe because only the lock-winner reaches this line. + // v0.8.2: this sweep is now orphan-gated. It only kills pre-GUI + // mcp-stdio children whose parent is already gone (real stale + // leftovers). 
The bootstrapper child that just spawned us has a + // live parent and is left alone — Cowork / Claude Code don't + // respawn disconnected MCP servers, so killing the bootstrapper + // is a one-way trip to "Server disconnected". let pre_gui_killed = housekeeping::kill_mcp_stdio_started_before_self(); if pre_gui_killed > 0 { logging::trace(&format!( - "[aiui] startup: killed {pre_gui_killed} pre-GUI mcp-stdio child(ren); \ - Claude Desktop will respawn them against the current binary" + "[aiui] startup: reaped {pre_gui_killed} pre-GUI orphan mcp-stdio child(ren) \ + (older than this GUI AND parent gone)" )); } diff --git a/companion/src-tauri/tauri.conf.json b/companion/src-tauri/tauri.conf.json index 867f959..9ce52af 100644 --- a/companion/src-tauri/tauri.conf.json +++ b/companion/src-tauri/tauri.conf.json @@ -1,7 +1,7 @@ { "$schema": "../node_modules/@tauri-apps/cli/config.schema.json", "productName": "aiui", - "version": "0.8.1", + "version": "0.8.2", "identifier": "de.byte5.aiui", "build": { "frontendDist": "../dist", diff --git a/python/pyproject.toml b/python/pyproject.toml index 14ee5e9..b6be3c6 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "aiui-mcp" -version = "0.8.1" +version = "0.8.2" description = "MCP server for aiui — native macOS dialogs from any Claude Code session, local or remote." readme = "README.md" requires-python = ">=3.10"