diff --git a/.changeset/fix-runtime-lost-indeterminate-2025.md b/.changeset/fix-runtime-lost-indeterminate-2025.md new file mode 100644 index 0000000000..f473235334 --- /dev/null +++ b/.changeset/fix-runtime-lost-indeterminate-2025.md @@ -0,0 +1,7 @@ +--- +"@aoagents/ao-core": patch +--- + +fix(core): terminate runtime-lost sessions when the agent process probe is indeterminate (#2025) + +When a tmux session vanished, `runtime.isAlive` reported a clean dead but the agent's tmux-based `isProcessRunning` threw and was mapped to `INDETERMINATE`. The lifecycle poll short-circuited on the indeterminate probe with `skipMetadataWrite`, never reaching `resolveProbeDecision`, so the session froze forever in `detecting`/`runtime_lost` on the dashboard sidebar. The poll now treats an indeterminate agent probe as dead when the runtime is authoritatively dead (a process inside a gone tmux session cannot be alive), letting the session resolve terminal. The `#1838` false-termination protection is preserved — this only fires on an authoritative dead runtime, and the recent-liveness guard still keeps a genuinely-working agent in `detecting`. diff --git a/packages/core/src/__tests__/lifecycle-manager.test.ts b/packages/core/src/__tests__/lifecycle-manager.test.ts index 68edb12fee..876a0a4f67 100644 --- a/packages/core/src/__tests__/lifecycle-manager.test.ts +++ b/packages/core/src/__tests__/lifecycle-manager.test.ts @@ -729,6 +729,60 @@ describe("check (single session)", () => { expect(lm.getStates().get("app-1")).toBe("working"); }); + it("terminates a runtime-lost session even when the agent process probe is indeterminate (#2025)", async () => { + // tmux gone: runtime.isAlive returns a clean dead, but the agent's + // tmux-based isProcessRunning throws -> indeterminate. The authoritative + // dead runtime must win so the session reaches a terminal state instead of + // freezing in detecting forever. + vi.mocked(plugins.runtime.isAlive).mockResolvedValue(false); + vi.mocked(plugins.agent.getActivityState).mockResolvedValue(null); + vi.mocked(plugins.agent.detectActivity).mockReturnValue("idle"); + vi.mocked(plugins.agent.isProcessRunning).mockResolvedValue("indeterminate"); + + const lm = setupCheck("app-1", { + session: makeSession({ + status: "detecting", + workspacePath: null, + metadata: { + lifecycleEvidence: "idle_beyond_threshold activity_signal=valid via_native activity=idle", + detectingAttempts: "1", + }, + }), + metaOverrides: { + lifecycleEvidence: "idle_beyond_threshold activity_signal=valid via_native activity=idle", + detectingAttempts: "1", + }, + }); + + await lm.check("app-1"); + + expect(lm.getStates().get("app-1")).toBe("killed"); + const meta = readMetadataRaw(env.sessionsDir, "app-1"); + expect(meta?.["lifecycleEvidence"]).toContain("runtime_dead process_dead"); + }); + + it("keeps a session in detecting when the runtime is dead but the agent probe is indeterminate AND activity is fresh (#1838 guard at the reclassification site)", async () => { + // Even after reclassifying the indeterminate probe to dead, a fresh liveness + // signal must route through the signal_disagreement branch to detecting — + // never a false termination of a genuinely-working agent. + vi.mocked(plugins.runtime.isAlive).mockResolvedValue(false); + vi.mocked(plugins.agent.getActivityState).mockResolvedValue({ + state: "active", + timestamp: new Date(Date.now() - 5_000), + }); + vi.mocked(plugins.agent.isProcessRunning).mockResolvedValue("indeterminate"); + + const lm = setupCheck("app-1", { + session: makeSession({ status: "working" }), + }); + + await lm.check("app-1"); + + expect(lm.getStates().get("app-1")).toBe("detecting"); + const meta = readMetadataRaw(env.sessionsDir, "app-1"); + expect(meta?.["lifecycleEvidence"]).toContain("signal_disagreement"); + }); + it("does not mark a session stuck from terminal-only idle evidence without a timestamp", async () => { config.reactions = { "agent-stuck": { auto: true, action: "notify", threshold: "1m" }, diff --git a/packages/core/src/__tests__/lifecycle-status-decisions.test.ts b/packages/core/src/__tests__/lifecycle-status-decisions.test.ts index f35e735d8e..7e8cc3d057 100644 --- a/packages/core/src/__tests__/lifecycle-status-decisions.test.ts +++ b/packages/core/src/__tests__/lifecycle-status-decisions.test.ts @@ -3,9 +3,11 @@ import { createDetectingDecision, hashEvidence, isDetectingTimedOut, + resolveProbeDecision, DETECTING_MAX_ATTEMPTS, DETECTING_MAX_DURATION_MS, } from "../lifecycle-status-decisions.js"; +import { createActivitySignal } from "../activity-signal.js"; describe("hashEvidence", () => { it("returns a 12-character hex string", () => { @@ -216,3 +218,78 @@ describe("createDetectingDecision", () => { }); }); }); + +describe("resolveProbeDecision", () => { + const noLivenessSignal = createActivitySignal("null", { source: "native" }); + const recentLivenessSignal = createActivitySignal("valid", { + activity: "active", + timestamp: new Date(), + source: "native", + }); + + const baseProbeInput = { + currentAttempts: 0, + canProbeRuntimeIdentity: true, + activitySignal: noLivenessSignal, + activityEvidence: "activity_signal=null via_native", + idleWasBlocked: false, + }; + + it("terminates when both runtime and agent probes report dead", () => { + const decision = resolveProbeDecision({ + ...baseProbeInput, + runtimeProbe: { state: "dead", failed: false }, + processProbe: { state: "dead", failed: false }, + }); + + expect(decision?.sessionState).toBe("terminated"); + expect(decision?.sessionReason).toBe("runtime_lost"); + }); + + it("keeps a dead runtime with a genuinely unknown process in detecting (no-agent grace)", () => { + // When the process state is truly unknown (e.g. no agent plugin to probe), + // resolveProbeDecision gives detecting grace rather than terminating. The + // #2025 path differs: the manager reclassifies an indeterminate agent probe + // to dead before this point, so it lands in the dead+dead terminal branch. + const decision = resolveProbeDecision({ + ...baseProbeInput, + runtimeProbe: { state: "dead", failed: false }, + processProbe: { state: "unknown", failed: false }, + }); + + expect(decision?.sessionState).toBe("detecting"); + expect(decision?.evidence).toContain("runtime_dead process_unknown"); + }); + + it("stays in detecting when runtime is dead but recent activity supports liveness (preserves #1838 protection)", () => { + const decision = resolveProbeDecision({ + ...baseProbeInput, + activitySignal: recentLivenessSignal, + runtimeProbe: { state: "dead", failed: false }, + processProbe: { state: "unknown", failed: false }, + }); + + expect(decision?.sessionState).toBe("detecting"); + }); + + it("stays in detecting on a transient probe failure (preserves #1838 protection)", () => { + const decision = resolveProbeDecision({ + ...baseProbeInput, + runtimeProbe: { state: "unknown", failed: true }, + processProbe: { state: "unknown", failed: false }, + }); + + expect(decision?.sessionState).toBe("detecting"); + }); + + it("does not terminate a live runtime when the agent probe is indeterminate", () => { + const decision = resolveProbeDecision({ + ...baseProbeInput, + runtimeProbe: { state: "alive", failed: false }, + processProbe: { state: "unknown", failed: false }, + }); + + // runtime alive + agent unknown is not a terminal signal; no decision here. + expect(decision).toBeNull(); + }); +}); diff --git a/packages/core/src/lifecycle-manager.ts b/packages/core/src/lifecycle-manager.ts index 3cae2f77e6..a9b6733ad2 100644 --- a/packages/core/src/lifecycle-manager.ts +++ b/packages/core/src/lifecycle-manager.ts @@ -1176,7 +1176,36 @@ export function createLifecycleManager(deps: LifecycleManagerDeps): LifecycleMan } } - if (processProbe.indeterminate) { + // An indeterminate agent process probe normally means "we couldn't tell" — + // hold the current state rather than risk a false termination (#1838). + // + // The exception: when the runtime probe authoritatively reports dead (e.g. a + // clean `tmux has-session` false), the agent process living inside that + // runtime cannot be alive. The agent's own probe is indeterminate precisely + // because it is tmux-based and threw when the session vanished. Reclassify + // it as dead so the poll can resolve terminal instead of freezing forever in + // `detecting` (#2025). #1838 protection is intact: we only do this on an + // authoritative dead runtime, never on a flaky/alive one. The + // recent-liveness guard inside resolveProbeDecision still keeps a + // genuinely-working agent in `detecting`. + const runtimeAuthoritativelyDead = runtimeProbe.state === "dead" && !runtimeProbe.failed; + if (processProbe.indeterminate && runtimeAuthoritativelyDead) { + // Leave a trace: the audit trail otherwise shows the session jumping + // straight to terminal with no record of the intermediate reclassification. + recordActivityEvent({ + projectId: session.projectId, + sessionId: session.id, + source: "agent", + kind: "agent.process_probe_failed", + level: "warn", + summary: `agent.isProcessRunning indeterminate for ${session.id} — reclassified dead (runtime authoritatively dead)`, + data: { + agentName, + reason: "probe_indeterminate_runtime_dead", + }, + }); + processProbe = { state: "dead", failed: false }; + } else if (processProbe.indeterminate) { recordActivityEvent({ projectId: session.projectId, sessionId: session.id,