diff --git a/.changeset/terminate-on-agent-exit.md b/.changeset/terminate-on-agent-exit.md new file mode 100644 index 0000000000..63470efba3 --- /dev/null +++ b/.changeset/terminate-on-agent-exit.md @@ -0,0 +1,7 @@ +--- +"@aoagents/ao-core": patch +--- + +fix(core): auto-terminate sessions when the agent exits but the runtime stays alive + +Previously, when an agent process exited while its tmux runtime stayed alive (keep-alive shell), the lifecycle treated `runtime=alive` + `process=dead` as a signal disagreement, ran the detecting cycle, and parked the session at `stuck`/`probe_failure` indefinitely — leaving it lingering on the dashboard sidebar. When the native activity signal and the process probe both agree the agent has exited, the session now terminates directly with reason `agent_process_exited`. Closes #1933 and #1966. diff --git a/packages/core/src/__tests__/lifecycle-manager.test.ts b/packages/core/src/__tests__/lifecycle-manager.test.ts index 68edb12fee..d7a57e8968 100644 --- a/packages/core/src/__tests__/lifecycle-manager.test.ts +++ b/packages/core/src/__tests__/lifecycle-manager.test.ts @@ -589,7 +589,7 @@ describe("check (single session)", () => { expect(lm.getStates().get("app-1")).toBe("killed"); }); - it("detects killed state when getActivityState returns exited", async () => { + it("terminates when getActivityState returns exited even though the runtime is still alive (#1933, #1966)", async () => { vi.mocked(plugins.agent.getActivityState).mockResolvedValue({ state: "exited" }); vi.mocked(plugins.runtime.isAlive).mockResolvedValue(true); @@ -598,7 +598,7 @@ describe("check (single session)", () => { }); await lm.check("app-1"); - expect(lm.getStates().get("app-1")).toBe("detecting"); + expect(lm.getStates().get("app-1")).toBe("killed"); }); it("detects killed via terminal fallback when getActivityState returns null", async () => { diff --git a/packages/core/src/__tests__/lifecycle-state.test.ts b/packages/core/src/__tests__/lifecycle-state.test.ts index 97c0ac61b6..f7f0b8e38e 100644 --- a/packages/core/src/__tests__/lifecycle-state.test.ts +++ b/packages/core/src/__tests__/lifecycle-state.test.ts @@ -55,10 +55,15 @@ describe("deriveLegacyStatus", () => { merged.session.state = "terminated"; merged.session.reason = "pr_merged"; + const agentExited = createOpenPRLifecycle(); + agentExited.session.state = "terminated"; + agentExited.session.reason = "agent_process_exited"; + expect(deriveLegacyStatus(killed)).toBe("killed"); expect(deriveLegacyStatus(cleanup)).toBe("cleanup"); expect(deriveLegacyStatus(errored)).toBe("errored"); expect(deriveLegacyStatus(merged)).toBe("cleanup"); + expect(deriveLegacyStatus(agentExited)).toBe("killed"); }); it("keeps PR-oriented aliases for idle workers with open PRs", () => { diff --git a/packages/core/src/__tests__/lifecycle-status-decisions.test.ts b/packages/core/src/__tests__/lifecycle-status-decisions.test.ts index f35e735d8e..47ae186272 100644 --- a/packages/core/src/__tests__/lifecycle-status-decisions.test.ts +++ b/packages/core/src/__tests__/lifecycle-status-decisions.test.ts @@ -3,9 +3,12 @@ import { createDetectingDecision, hashEvidence, isDetectingTimedOut, + resolveProbeDecision, DETECTING_MAX_ATTEMPTS, DETECTING_MAX_DURATION_MS, } from "../lifecycle-status-decisions.js"; +import { createActivitySignal } from "../activity-signal.js"; +import type { ActivitySignal } from "../types.js"; describe("hashEvidence", () => { it("returns a 12-character hex string", () => { @@ -216,3 +219,59 @@ describe("createDetectingDecision", () => { }); }); }); + +describe("resolveProbeDecision", () => { + const baseProbeInput = { + currentAttempts: 0, + canProbeRuntimeIdentity: true, + activityEvidence: "", + idleWasBlocked: false, + }; + + const exitedSignal: ActivitySignal = createActivitySignal("valid", { + activity: "exited", + source: "native", + }); + + it("terminates immediately when the native activity signal says the agent exited, even if the runtime is still alive", () => { + const result = resolveProbeDecision({ + ...baseProbeInput, + runtimeProbe: { state: "alive", failed: false }, + processProbe: { state: "dead", failed: false }, + activitySignal: exitedSignal, + activityEvidence: "activity_signal=valid via_native activity=exited", + }); + + expect(result?.sessionState).toBe("terminated"); + expect(result?.sessionReason).toBe("agent_process_exited"); + expect(result?.status).toBe("killed"); + // Must not route into the detecting/stuck cycle. + expect(result?.detecting.attempts).toBe(0); + }); + + it("does not enter signal_disagreement detecting when activity confirms the agent exited", () => { + const result = resolveProbeDecision({ + ...baseProbeInput, + // Carries 50 prior detecting attempts — a session already parked at stuck. + currentAttempts: 50, + runtimeProbe: { state: "alive", failed: false }, + processProbe: { state: "dead", failed: false }, + activitySignal: exitedSignal, + }); + + expect(result?.sessionState).toBe("terminated"); + expect(result?.evidence).not.toContain("signal_disagreement"); + }); + + it("keeps treating runtime=alive process=dead as signal disagreement when the agent has NOT exited", () => { + const result = resolveProbeDecision({ + ...baseProbeInput, + runtimeProbe: { state: "alive", failed: false }, + processProbe: { state: "dead", failed: false }, + activitySignal: createActivitySignal("unavailable"), + }); + + expect(result?.sessionState).toBe("detecting"); + expect(result?.evidence).toContain("signal_disagreement"); + }); +}); diff --git a/packages/core/src/lifecycle-state.ts b/packages/core/src/lifecycle-state.ts index 6192c23a96..ec4e17a78d 100644 --- a/packages/core/src/lifecycle-state.ts +++ b/packages/core/src/lifecycle-state.ts @@ -446,6 +446,7 @@ export function deriveLegacyStatus( switch (lifecycle.session.reason) { case "manually_killed": case "runtime_lost": + case "agent_process_exited": return "killed"; case "auto_cleanup": case "pr_merged": diff --git a/packages/core/src/lifecycle-status-decisions.ts b/packages/core/src/lifecycle-status-decisions.ts index 14332e93f9..4dea2bfe8e 100644 --- a/packages/core/src/lifecycle-status-decisions.ts +++ b/packages/core/src/lifecycle-status-decisions.ts @@ -290,6 +290,28 @@ export function parseAttemptCount(raw: string | undefined): number { export function resolveProbeDecision(input: ProbeDecisionInput): LifecycleDecision | null { const recentActivitySupportsLiveness = supportsRecentLiveness(input.activitySignal); + // When the native activity signal and the process probe agree the agent has + // exited, the session is done — terminate it regardless of the runtime probe. + // The tmux keep-alive shell keeps the runtime "alive" after the agent dies, + // which would otherwise route this into the signal_disagreement detecting/stuck + // cycle and park the session at stuck forever (#1933, #1966). The + // processProbe=dead requirement excludes spawning sessions, whose process + // probe stays "unknown" until the agent has started (#1035). + if ( + input.activitySignal.state === "valid" && + input.activitySignal.activity === "exited" && + input.processProbe.state === "dead" && + !input.processProbe.failed + ) { + return createLifecycleDecision({ + status: SESSION_STATUS.KILLED, + evidence: `agent_exited runtime=${input.runtimeProbe.state} process=${input.processProbe.state} ${input.activityEvidence}`, + detecting: { attempts: 0 }, + sessionState: "terminated", + sessionReason: "agent_process_exited", + }); + } + if (input.runtimeProbe.failed || input.processProbe.failed) { return createDetectingDecision({ currentAttempts: input.currentAttempts,