Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changeset/fix-runtime-lost-indeterminate-2025.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"@aoagents/ao-core": patch
---

fix(core): terminate runtime-lost sessions when the agent process probe is indeterminate (#2025)

When a tmux session vanished, `runtime.isAlive` cleanly reported the runtime as dead, but the agent's tmux-based `isProcessRunning` threw and was mapped to `INDETERMINATE`. The lifecycle poll short-circuited on the indeterminate probe with `skipMetadataWrite` and never reached `resolveProbeDecision`, so the session froze forever in `detecting`/`runtime_lost` on the dashboard sidebar. The poll now treats an indeterminate agent probe as dead when the runtime is authoritatively dead (a process inside a tmux session that no longer exists cannot be alive), letting the session resolve to a terminal state. The `#1838` false-termination protection is preserved — this only fires on an authoritatively dead runtime, and the recent-liveness guard still keeps a genuinely working agent in `detecting`.
54 changes: 54 additions & 0 deletions packages/core/src/__tests__/lifecycle-manager.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,60 @@ describe("check (single session)", () => {
expect(lm.getStates().get("app-1")).toBe("working");
});

it("terminates a runtime-lost session even when the agent process probe is indeterminate (#2025)", async () => {
  // Scenario: the runtime probe cleanly reports "not alive" while the agent's
  // process probe resolves to "indeterminate". A cleanly dead runtime has to
  // take precedence, otherwise the session would sit in detecting forever
  // instead of reaching a terminal state.

  // Same stale-idle evidence is supplied both on the session object and as
  // metadata overrides; each consumer gets its own copy.
  const staleIdleMetadata = {
    lifecycleEvidence: "idle_beyond_threshold activity_signal=valid via_native activity=idle",
    detectingAttempts: "1",
  };

  vi.mocked(plugins.runtime.isAlive).mockResolvedValue(false);
  vi.mocked(plugins.agent.getActivityState).mockResolvedValue(null);
  vi.mocked(plugins.agent.detectActivity).mockReturnValue("idle");
  vi.mocked(plugins.agent.isProcessRunning).mockResolvedValue("indeterminate");

  const detectingSession = makeSession({
    status: "detecting",
    workspacePath: null,
    metadata: { ...staleIdleMetadata },
  });
  const manager = setupCheck("app-1", {
    session: detectingSession,
    metaOverrides: { ...staleIdleMetadata },
  });

  await manager.check("app-1");

  // The session must resolve terminal ...
  expect(manager.getStates().get("app-1")).toBe("killed");
  // ... and the persisted evidence must record both probes as dead.
  const persisted = readMetadataRaw(env.sessionsDir, "app-1");
  expect(persisted?.["lifecycleEvidence"]).toContain("runtime_dead process_dead");
});
Comment thread
harshitsinghbhandari marked this conversation as resolved.

it("keeps a session in detecting when the runtime is dead but the agent probe is indeterminate AND activity is fresh (#1838 guard at the reclassification site)", async () => {
  // Guard test for the reclassification path: the runtime is dead and the
  // agent probe is indeterminate, but the agent reported activity only a few
  // seconds ago. That fresh signal must land the session in detecting via the
  // signal_disagreement evidence rather than terminating a working agent.
  const fiveSecondsAgo = new Date(Date.now() - 5_000);

  vi.mocked(plugins.runtime.isAlive).mockResolvedValue(false);
  vi.mocked(plugins.agent.isProcessRunning).mockResolvedValue("indeterminate");
  vi.mocked(plugins.agent.getActivityState).mockResolvedValue({
    state: "active",
    timestamp: fiveSecondsAgo,
  });

  const workingSession = makeSession({ status: "working" });
  const manager = setupCheck("app-1", { session: workingSession });

  await manager.check("app-1");

  expect(manager.getStates().get("app-1")).toBe("detecting");
  const persisted = readMetadataRaw(env.sessionsDir, "app-1");
  expect(persisted?.["lifecycleEvidence"]).toContain("signal_disagreement");
});

it("does not mark a session stuck from terminal-only idle evidence without a timestamp", async () => {
config.reactions = {
"agent-stuck": { auto: true, action: "notify", threshold: "1m" },
Expand Down
77 changes: 77 additions & 0 deletions packages/core/src/__tests__/lifecycle-status-decisions.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ import {
createDetectingDecision,
hashEvidence,
isDetectingTimedOut,
resolveProbeDecision,
DETECTING_MAX_ATTEMPTS,
DETECTING_MAX_DURATION_MS,
} from "../lifecycle-status-decisions.js";
import { createActivitySignal } from "../activity-signal.js";

describe("hashEvidence", () => {
it("returns a 12-character hex string", () => {
Expand Down Expand Up @@ -216,3 +218,78 @@ describe("createDetectingDecision", () => {
});
});
});

describe("resolveProbeDecision", () => {
  type ProbeInput = Parameters<typeof resolveProbeDecision>[0];

  // Baseline signal built from the "null" kind; used wherever a case should
  // not be influenced by recent agent activity.
  const silentSignal = createActivitySignal("null", { source: "native" });
  // "valid" signal reporting an active agent, timestamped when the suite is set up.
  const freshActiveSignal = createActivitySignal("valid", {
    activity: "active",
    timestamp: new Date(),
    source: "native",
  });

  // Fields shared by every case; each case supplies its own probe pair.
  const defaults = {
    currentAttempts: 0,
    canProbeRuntimeIdentity: true,
    activitySignal: silentSignal,
    activityEvidence: "activity_signal=null via_native",
    idleWasBlocked: false,
  };

  /** Runs resolveProbeDecision with the shared defaults plus per-case overrides. */
  const decide = (
    overrides: Pick<ProbeInput, "runtimeProbe" | "processProbe"> & Partial<ProbeInput>,
  ) => resolveProbeDecision({ ...defaults, ...overrides });

  it("terminates when both runtime and agent probes report dead", () => {
    const outcome = decide({
      runtimeProbe: { state: "dead", failed: false },
      processProbe: { state: "dead", failed: false },
    });

    expect(outcome?.sessionState).toBe("terminated");
    expect(outcome?.sessionReason).toBe("runtime_lost");
  });

  it("keeps a dead runtime with a genuinely unknown process in detecting (no-agent grace)", () => {
    // A truly unknown process state (for example, nothing available to probe)
    // earns detecting grace here instead of termination. The #2025 scenario
    // never reaches this branch: the lifecycle manager converts an
    // indeterminate agent probe to dead beforehand, which is the dead+dead
    // terminal case.
    const outcome = decide({
      runtimeProbe: { state: "dead", failed: false },
      processProbe: { state: "unknown", failed: false },
    });

    expect(outcome?.sessionState).toBe("detecting");
    expect(outcome?.evidence).toContain("runtime_dead process_unknown");
  });

  it("stays in detecting when runtime is dead but recent activity supports liveness (preserves #1838 protection)", () => {
    const outcome = decide({
      activitySignal: freshActiveSignal,
      runtimeProbe: { state: "dead", failed: false },
      processProbe: { state: "unknown", failed: false },
    });

    expect(outcome?.sessionState).toBe("detecting");
  });

  it("stays in detecting on a transient probe failure (preserves #1838 protection)", () => {
    const outcome = decide({
      runtimeProbe: { state: "unknown", failed: true },
      processProbe: { state: "unknown", failed: false },
    });

    expect(outcome?.sessionState).toBe("detecting");
  });

  it("does not terminate a live runtime when the agent probe is indeterminate", () => {
    const outcome = decide({
      runtimeProbe: { state: "alive", failed: false },
      processProbe: { state: "unknown", failed: false },
    });

    // An alive runtime paired with an unknown agent state carries no terminal
    // signal, so no decision is produced.
    expect(outcome).toBeNull();
  });
});
31 changes: 30 additions & 1 deletion packages/core/src/lifecycle-manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1176,7 +1176,36 @@ export function createLifecycleManager(deps: LifecycleManagerDeps): LifecycleMan
}
}

if (processProbe.indeterminate) {
// An indeterminate agent process probe normally means "we couldn't tell" —
// hold the current state rather than risk a false termination (#1838).
//
// The exception: when the runtime probe authoritatively reports dead (e.g. a
// clean `tmux has-session` false), the agent process living inside that
// runtime cannot be alive. The agent's own probe is indeterminate precisely
// because it is tmux-based and threw when the session vanished. Reclassify
// it as dead so the poll can resolve terminal instead of freezing forever in
// `detecting` (#2025). #1838 protection is intact: we only do this on an
// authoritative dead runtime, never on a flaky/alive one. The
// recent-liveness guard inside resolveProbeDecision still keeps a
// genuinely-working agent in `detecting`.
const runtimeAuthoritativelyDead = runtimeProbe.state === "dead" && !runtimeProbe.failed;
if (processProbe.indeterminate && runtimeAuthoritativelyDead) {
// Leave a trace: the audit trail otherwise shows the session jumping
// straight to terminal with no record of the intermediate reclassification.
recordActivityEvent({
projectId: session.projectId,
sessionId: session.id,
source: "agent",
kind: "agent.process_probe_failed",
level: "warn",
summary: `agent.isProcessRunning indeterminate for ${session.id} — reclassified dead (runtime authoritatively dead)`,
data: {
agentName,
reason: "probe_indeterminate_runtime_dead",
},
});
processProbe = { state: "dead", failed: false };
} else if (processProbe.indeterminate) {
Comment thread
harshitsinghbhandari marked this conversation as resolved.
recordActivityEvent({
projectId: session.projectId,
sessionId: session.id,
Expand Down
Loading