Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .changeset/terminate-on-agent-exit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
"@aoagents/ao-core": patch
---

fix(core): auto-terminate sessions when the agent exits but the runtime stays alive

Previously, when an agent process exited while its tmux runtime stayed alive (keep-alive shell), the lifecycle treated `runtime=alive` + `process=dead` as a signal disagreement, ran the detecting cycle, and parked the session at `stuck`/`probe_failure` indefinitely — leaving it lingering on the dashboard sidebar. When the native activity signal and the process probe both agree the agent has exited, the session now terminates directly with reason `agent_process_exited`. Closes #1933 and #1966.
4 changes: 2 additions & 2 deletions packages/core/src/__tests__/lifecycle-manager.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ describe("check (single session)", () => {
expect(lm.getStates().get("app-1")).toBe("killed");
});

it("detects killed state when getActivityState returns exited", async () => {
it("terminates when getActivityState returns exited even though the runtime is still alive (#1933, #1966)", async () => {
vi.mocked(plugins.agent.getActivityState).mockResolvedValue({ state: "exited" });
vi.mocked(plugins.runtime.isAlive).mockResolvedValue(true);

Expand All @@ -598,7 +598,7 @@ describe("check (single session)", () => {
});

await lm.check("app-1");
expect(lm.getStates().get("app-1")).toBe("detecting");
expect(lm.getStates().get("app-1")).toBe("killed");
});

it("detects killed via terminal fallback when getActivityState returns null", async () => {
Expand Down
5 changes: 5 additions & 0 deletions packages/core/src/__tests__/lifecycle-state.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,15 @@ describe("deriveLegacyStatus", () => {
merged.session.state = "terminated";
merged.session.reason = "pr_merged";

const agentExited = createOpenPRLifecycle();
agentExited.session.state = "terminated";
agentExited.session.reason = "agent_process_exited";

expect(deriveLegacyStatus(killed)).toBe("killed");
expect(deriveLegacyStatus(cleanup)).toBe("cleanup");
expect(deriveLegacyStatus(errored)).toBe("errored");
expect(deriveLegacyStatus(merged)).toBe("cleanup");
expect(deriveLegacyStatus(agentExited)).toBe("killed");
});

it("keeps PR-oriented aliases for idle workers with open PRs", () => {
Expand Down
59 changes: 59 additions & 0 deletions packages/core/src/__tests__/lifecycle-status-decisions.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ import {
createDetectingDecision,
hashEvidence,
isDetectingTimedOut,
resolveProbeDecision,
DETECTING_MAX_ATTEMPTS,
DETECTING_MAX_DURATION_MS,
} from "../lifecycle-status-decisions.js";
import { createActivitySignal } from "../activity-signal.js";
import type { ActivitySignal } from "../types.js";

describe("hashEvidence", () => {
it("returns a 12-character hex string", () => {
Expand Down Expand Up @@ -216,3 +219,59 @@ describe("createDetectingDecision", () => {
});
});
});

describe("resolveProbeDecision", () => {
  // Fields shared by every scenario; individual tests override what they need.
  const baseProbeInput = {
    currentAttempts: 0,
    canProbeRuntimeIdentity: true,
    activityEvidence: "",
    idleWasBlocked: false,
  };

  // Native activity signal reporting that the agent has exited.
  const exitedSignal: ActivitySignal = createActivitySignal("valid", {
    activity: "exited",
    source: "native",
  });

  /**
   * Runs resolveProbeDecision for the scenario every test here shares:
   * the runtime probe reports "alive" while the process probe reports "dead",
   * and neither probe failed. Only the activity signal and a few base fields
   * vary between tests.
   */
  function decideWithLiveRuntimeAndDeadProcess(
    activitySignal: ActivitySignal,
    overrides: Partial<typeof baseProbeInput> = {},
  ) {
    return resolveProbeDecision({
      ...baseProbeInput,
      runtimeProbe: { state: "alive", failed: false },
      processProbe: { state: "dead", failed: false },
      activitySignal,
      ...overrides,
    });
  }

  it("terminates immediately when the native activity signal says the agent exited, even if the runtime is still alive", () => {
    const decision = decideWithLiveRuntimeAndDeadProcess(exitedSignal, {
      activityEvidence: "activity_signal=valid via_native activity=exited",
    });

    expect(decision?.sessionState).toBe("terminated");
    expect(decision?.sessionReason).toBe("agent_process_exited");
    expect(decision?.status).toBe("killed");
    // Must not route into the detecting/stuck cycle.
    expect(decision?.detecting.attempts).toBe(0);
  });

  it("does not enter signal_disagreement detecting when activity confirms the agent exited", () => {
    // Carries 50 prior detecting attempts — a session already parked at stuck.
    const decision = decideWithLiveRuntimeAndDeadProcess(exitedSignal, {
      currentAttempts: 50,
    });

    expect(decision?.sessionState).toBe("terminated");
    expect(decision?.evidence).not.toContain("signal_disagreement");
  });

  it("keeps treating runtime=alive process=dead as signal disagreement when the agent has NOT exited", () => {
    const decision = decideWithLiveRuntimeAndDeadProcess(createActivitySignal("unavailable"));

    expect(decision?.sessionState).toBe("detecting");
    expect(decision?.evidence).toContain("signal_disagreement");
  });
});
1 change: 1 addition & 0 deletions packages/core/src/lifecycle-state.ts
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ export function deriveLegacyStatus(
switch (lifecycle.session.reason) {
case "manually_killed":
case "runtime_lost":
case "agent_process_exited":
return "killed";
case "auto_cleanup":
case "pr_merged":
Expand Down
22 changes: 22 additions & 0 deletions packages/core/src/lifecycle-status-decisions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,28 @@ export function parseAttemptCount(raw: string | undefined): number {
export function resolveProbeDecision(input: ProbeDecisionInput): LifecycleDecision | null {
const recentActivitySupportsLiveness = supportsRecentLiveness(input.activitySignal);

// When the native activity signal and the process probe agree the agent has
// exited, the session is done — terminate it regardless of the runtime probe.
// The tmux keep-alive shell keeps the runtime "alive" after the agent dies,
// which would otherwise route this into the signal_disagreement detecting/stuck
// cycle and park the session at stuck forever (#1933, #1966). The
// processProbe=dead requirement excludes spawning sessions, whose process
// probe stays "unknown" until the agent has started (#1035).
if (
input.activitySignal.state === "valid" &&
input.activitySignal.activity === "exited" &&
input.processProbe.state === "dead" &&
!input.processProbe.failed
) {
return createLifecycleDecision({
status: SESSION_STATUS.KILLED,
evidence: `agent_exited runtime=${input.runtimeProbe.state} process=${input.processProbe.state} ${input.activityEvidence}`,
detecting: { attempts: 0 },
sessionState: "terminated",
sessionReason: "agent_process_exited",
});
}
Comment thread
harshitsinghbhandari marked this conversation as resolved.

if (input.runtimeProbe.failed || input.processProbe.failed) {
return createDetectingDecision({
currentAttempts: input.currentAttempts,
Expand Down
Loading