From ce14f1825c42e5fe42e66ab5e76545bbcaab67a5 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari <24b4506@iitb.ac.in> Date: Sat, 23 May 2026 16:01:00 +0530 Subject: [PATCH 1/3] fix(core): auto-terminate sessions when agent exits but runtime stays alive When the agent process exits but tmux stays alive (keep-alive shell), resolveProbeDecision treated runtime=alive + process=dead as a signal disagreement, ran the detecting cycle, and parked the session at stuck/probe_failure forever. The native activity signal + process probe both agree the agent is gone, so terminate directly instead. Closes #1933, closes #1966. Co-Authored-By: Claude Opus 4.7 --- .../src/__tests__/lifecycle-manager.test.ts | 4 +- .../src/__tests__/lifecycle-state.test.ts | 5 ++ .../lifecycle-status-decisions.test.ts | 59 +++++++++++++++++++ packages/core/src/lifecycle-state.ts | 1 + .../core/src/lifecycle-status-decisions.ts | 21 +++++++ 5 files changed, 88 insertions(+), 2 deletions(-) diff --git a/packages/core/src/__tests__/lifecycle-manager.test.ts b/packages/core/src/__tests__/lifecycle-manager.test.ts index 68edb12fee..d7a57e8968 100644 --- a/packages/core/src/__tests__/lifecycle-manager.test.ts +++ b/packages/core/src/__tests__/lifecycle-manager.test.ts @@ -589,7 +589,7 @@ describe("check (single session)", () => { expect(lm.getStates().get("app-1")).toBe("killed"); }); - it("detects killed state when getActivityState returns exited", async () => { + it("terminates when getActivityState returns exited even though the runtime is still alive (#1933, #1966)", async () => { vi.mocked(plugins.agent.getActivityState).mockResolvedValue({ state: "exited" }); vi.mocked(plugins.runtime.isAlive).mockResolvedValue(true); @@ -598,7 +598,7 @@ describe("check (single session)", () => { }); await lm.check("app-1"); - expect(lm.getStates().get("app-1")).toBe("detecting"); + expect(lm.getStates().get("app-1")).toBe("killed"); }); it("detects killed via terminal fallback when getActivityState returns null", async () 
=> { diff --git a/packages/core/src/__tests__/lifecycle-state.test.ts b/packages/core/src/__tests__/lifecycle-state.test.ts index 97c0ac61b6..f7f0b8e38e 100644 --- a/packages/core/src/__tests__/lifecycle-state.test.ts +++ b/packages/core/src/__tests__/lifecycle-state.test.ts @@ -55,10 +55,15 @@ describe("deriveLegacyStatus", () => { merged.session.state = "terminated"; merged.session.reason = "pr_merged"; + const agentExited = createOpenPRLifecycle(); + agentExited.session.state = "terminated"; + agentExited.session.reason = "agent_process_exited"; + expect(deriveLegacyStatus(killed)).toBe("killed"); expect(deriveLegacyStatus(cleanup)).toBe("cleanup"); expect(deriveLegacyStatus(errored)).toBe("errored"); expect(deriveLegacyStatus(merged)).toBe("cleanup"); + expect(deriveLegacyStatus(agentExited)).toBe("killed"); }); it("keeps PR-oriented aliases for idle workers with open PRs", () => { diff --git a/packages/core/src/__tests__/lifecycle-status-decisions.test.ts b/packages/core/src/__tests__/lifecycle-status-decisions.test.ts index f35e735d8e..47ae186272 100644 --- a/packages/core/src/__tests__/lifecycle-status-decisions.test.ts +++ b/packages/core/src/__tests__/lifecycle-status-decisions.test.ts @@ -3,9 +3,12 @@ import { createDetectingDecision, hashEvidence, isDetectingTimedOut, + resolveProbeDecision, DETECTING_MAX_ATTEMPTS, DETECTING_MAX_DURATION_MS, } from "../lifecycle-status-decisions.js"; +import { createActivitySignal } from "../activity-signal.js"; +import type { ActivitySignal } from "../types.js"; describe("hashEvidence", () => { it("returns a 12-character hex string", () => { @@ -216,3 +219,59 @@ describe("createDetectingDecision", () => { }); }); }); + +describe("resolveProbeDecision", () => { + const baseProbeInput = { + currentAttempts: 0, + canProbeRuntimeIdentity: true, + activityEvidence: "", + idleWasBlocked: false, + }; + + const exitedSignal: ActivitySignal = createActivitySignal("valid", { + activity: "exited", + source: "native", + }); + + 
it("terminates immediately when the native activity signal says the agent exited, even if the runtime is still alive", () => { + const result = resolveProbeDecision({ + ...baseProbeInput, + runtimeProbe: { state: "alive", failed: false }, + processProbe: { state: "dead", failed: false }, + activitySignal: exitedSignal, + activityEvidence: "activity_signal=valid via_native activity=exited", + }); + + expect(result?.sessionState).toBe("terminated"); + expect(result?.sessionReason).toBe("agent_process_exited"); + expect(result?.status).toBe("killed"); + // Must not route into the detecting/stuck cycle. + expect(result?.detecting.attempts).toBe(0); + }); + + it("does not enter signal_disagreement detecting when activity confirms the agent exited", () => { + const result = resolveProbeDecision({ + ...baseProbeInput, + // Carries 50 prior detecting attempts — a session already parked at stuck. + currentAttempts: 50, + runtimeProbe: { state: "alive", failed: false }, + processProbe: { state: "dead", failed: false }, + activitySignal: exitedSignal, + }); + + expect(result?.sessionState).toBe("terminated"); + expect(result?.evidence).not.toContain("signal_disagreement"); + }); + + it("keeps treating runtime=alive process=dead as signal disagreement when the agent has NOT exited", () => { + const result = resolveProbeDecision({ + ...baseProbeInput, + runtimeProbe: { state: "alive", failed: false }, + processProbe: { state: "dead", failed: false }, + activitySignal: createActivitySignal("unavailable"), + }); + + expect(result?.sessionState).toBe("detecting"); + expect(result?.evidence).toContain("signal_disagreement"); + }); +}); diff --git a/packages/core/src/lifecycle-state.ts b/packages/core/src/lifecycle-state.ts index 6192c23a96..ec4e17a78d 100644 --- a/packages/core/src/lifecycle-state.ts +++ b/packages/core/src/lifecycle-state.ts @@ -446,6 +446,7 @@ export function deriveLegacyStatus( switch (lifecycle.session.reason) { case "manually_killed": case "runtime_lost": + 
case "agent_process_exited": return "killed"; case "auto_cleanup": case "pr_merged": diff --git a/packages/core/src/lifecycle-status-decisions.ts b/packages/core/src/lifecycle-status-decisions.ts index 14332e93f9..4d2f84e5ba 100644 --- a/packages/core/src/lifecycle-status-decisions.ts +++ b/packages/core/src/lifecycle-status-decisions.ts @@ -290,6 +290,27 @@ export function parseAttemptCount(raw: string | undefined): number { export function resolveProbeDecision(input: ProbeDecisionInput): LifecycleDecision | null { const recentActivitySupportsLiveness = supportsRecentLiveness(input.activitySignal); + // When the native activity signal and the process probe agree the agent has + // exited, the session is done — terminate it regardless of the runtime probe. + // The tmux keep-alive shell keeps the runtime "alive" after the agent dies, + // which would otherwise route this into the signal_disagreement detecting/stuck + // cycle and park the session at stuck forever (#1933, #1966). The + // processProbe=dead requirement excludes spawning sessions, whose process + // probe stays "unknown" until the agent has started (#1035). 
+ if ( + input.activitySignal.state === "valid" && + input.activitySignal.activity === "exited" && + input.processProbe.state === "dead" + ) { + return createLifecycleDecision({ + status: SESSION_STATUS.KILLED, + evidence: `agent_exited runtime=${input.runtimeProbe.state} process=${input.processProbe.state} ${input.activityEvidence}`, + detecting: { attempts: 0 }, + sessionState: "terminated", + sessionReason: "agent_process_exited", + }); + } + if (input.runtimeProbe.failed || input.processProbe.failed) { return createDetectingDecision({ currentAttempts: input.currentAttempts, From faec50998eabf9631a9e1f436f35e6a28317f6c9 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari <24b4506@iitb.ac.in> Date: Sat, 23 May 2026 16:02:33 +0530 Subject: [PATCH 2/3] chore: add changeset for agent-exit auto-termination fix Co-Authored-By: Claude Opus 4.7 --- .changeset/terminate-on-agent-exit.md | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .changeset/terminate-on-agent-exit.md diff --git a/.changeset/terminate-on-agent-exit.md b/.changeset/terminate-on-agent-exit.md new file mode 100644 index 0000000000..63470efba3 --- /dev/null +++ b/.changeset/terminate-on-agent-exit.md @@ -0,0 +1,7 @@ +--- +"@aoagents/ao-core": patch +--- + +fix(core): auto-terminate sessions when the agent exits but the runtime stays alive + +Previously, when an agent process exited while its tmux runtime stayed alive (keep-alive shell), the lifecycle treated `runtime=alive` + `process=dead` as a signal disagreement, ran the detecting cycle, and parked the session at `stuck`/`probe_failure` indefinitely — leaving it lingering on the dashboard sidebar. When the native activity signal and the process probe both agree the agent has exited, the session now terminates directly with reason `agent_process_exited`. Closes #1933 and #1966. 
From 91ff2a7f5c27c070d5f4658dbf6cdde5b0034ac2 Mon Sep 17 00:00:00 2001 From: harshitsinghbhandari <24b4506@iitb.ac.in> Date: Sat, 23 May 2026 16:08:45 +0530 Subject: [PATCH 3/3] fix(core): guard agent-exit termination on processProbe.failed Align the new terminate-on-exit branch with the rest of resolveProbeDecision by requiring processProbe.failed === false before acting on its state, so a probe that defaults to dead on error can't terminate a live session. Co-Authored-By: Claude Opus 4.7 --- packages/core/src/lifecycle-status-decisions.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/core/src/lifecycle-status-decisions.ts b/packages/core/src/lifecycle-status-decisions.ts index 4d2f84e5ba..4dea2bfe8e 100644 --- a/packages/core/src/lifecycle-status-decisions.ts +++ b/packages/core/src/lifecycle-status-decisions.ts @@ -300,7 +300,8 @@ export function resolveProbeDecision(input: ProbeDecisionInput): LifecycleDecisi if ( input.activitySignal.state === "valid" && input.activitySignal.activity === "exited" && - input.processProbe.state === "dead" + input.processProbe.state === "dead" && + !input.processProbe.failed ) { return createLifecycleDecision({ status: SESSION_STATUS.KILLED,