diff --git a/benchmarks/terminal-bench/README.md b/benchmarks/terminal-bench/README.md new file mode 100644 index 0000000..38412f6 --- /dev/null +++ b/benchmarks/terminal-bench/README.md @@ -0,0 +1,135 @@ +# Ouroboros Terminal-Bench 2.0 Pilot Harness + +This directory contains a local pilot harness for running the Ouroboros CLI on +Terminal-Bench 2.0 through Harbor. It is meant to validate that Ouroboros can be +installed, built, and invoked inside Harbor task containers. + +This is not a leaderboard submission package. It does not add audited +leaderboard metadata or a full ATIF trajectory converter. + +## Files + +- `ouroboros_tbench_agent.py`: Harbor `BaseInstalledAgent` adapter for + Ouroboros. +- `run-pilot.sh`: local convenience script for a one-concurrency pilot run. + +The directory name contains a hyphen, so it is not imported as a Python package. +`run-pilot.sh` adds this directory to `PYTHONPATH` and imports the adapter as: + +```bash +ouroboros_tbench_agent:OuroborosInstalledAgent +``` + +## Prerequisites + +- Docker Desktop installed and running. +- `uv` installed. +- Network access from task containers for installing Bun and calling the model + provider. +- `OPENAI_API_KEY` exported in the shell. + +Optional environment variables: + +- `OUROBOROS_TBENCH_MODEL`, default `openai/gpt-5.5` +- `OUROBOROS_TBENCH_REASONING`, default `medium` +- `OUROBOROS_TBENCH_MAX_STEPS`, default `50` +- `OUROBOROS_TBENCH_N_CONCURRENT`, default `1` +- `OUROBOROS_TBENCH_TIMEOUT_SEC`, default `3600` +- `OUROBOROS_TBENCH_JOBS_DIR`, default `/private/tmp/ouroboros-tbench/jobs` + +## Developer Execution Plan + +1. Start Docker Desktop and verify the daemon is reachable: + + ```bash + docker info + ``` + +2. Export credentials: + + ```bash + export OPENAI_API_KEY=... + ``` + +3. Verify Harbor is available through `uv`: + + ```bash + uv tool run harbor --help + ``` + +4. 
Run the Harbor oracle sanity check: + + ```bash + uv tool run harbor run --dataset terminal-bench@2.0 --agent oracle --n-concurrent 1 + ``` + +5. Run the Ouroboros pilot: + + ```bash + benchmarks/terminal-bench/run-pilot.sh + ``` + +6. Inspect results: + + ```bash + ls -la /private/tmp/ouroboros-tbench/jobs + ``` + + Open the latest Harbor job directory and inspect the trial `agent/`, + `verifier/`, `result.json`, and `trial.log` files. Ouroboros logs are written + as `agent/ouroboros.txt`, `agent/ouroboros-stdout.txt`, and + `agent/ouroboros-stderr.txt`. + +## Verification Without Running The Benchmark + +Check shell syntax: + +```bash +bash -n benchmarks/terminal-bench/run-pilot.sh +``` + +Validate the adapter import: + +```bash +PYTHONPATH=benchmarks/terminal-bench \ + uv tool run --with harbor python -c "from ouroboros_tbench_agent import OuroborosInstalledAgent; print(OuroborosInstalledAgent.name())" +``` + +Run the repo verification suite: + +```bash +bun run verify +``` + +## Troubleshooting + +### Docker daemon is down + +If `docker info` fails, start Docker Desktop and wait until it reports that the +engine is running. + +### `OPENAI_API_KEY` is missing + +`run-pilot.sh` exits early when `OPENAI_API_KEY` is empty because the default +model is `openai/gpt-5.5`. + +### Harbor is missing + +Run `uv tool run harbor --help`, the same command used in the execution plan +and as the fallback in `run-pilot.sh`. If `uv` cannot resolve Harbor, install it with: + +```bash +uv tool install harbor +``` + +### Container setup fails while installing Bun + +Confirm the task container has outbound network access and can reach +`https://bun.sh`. Some Terminal-Bench tasks may intentionally restrict internet +access; those tasks are not suitable for this pilot adapter without pre-baking +Ouroboros and Bun into the agent image. + +### Ouroboros build fails in the task container + +Inspect `agent/ouroboros-stderr.txt` and `trial.log` in the latest Harbor job +directory. 
The adapter uploads a filtered copy of the current repo and runs
+`bun install` followed by `bun run --filter @ouroboros/cli build`.
diff --git a/benchmarks/terminal-bench/ouroboros_tbench_agent.py b/benchmarks/terminal-bench/ouroboros_tbench_agent.py
new file mode 100644
index 0000000..ff034f1
--- /dev/null
+++ b/benchmarks/terminal-bench/ouroboros_tbench_agent.py
@@ -0,0 +1,268 @@
+"""Harbor installed-agent adapter that runs the Ouroboros CLI on Terminal-Bench 2.0.
+
+Settings come from ``OUROBOROS_TBENCH_*`` environment variables of the process
+running Harbor; unset or blank values fall back to the defaults declared below.
+"""
+
+import os
+import shlex
+import shutil
+from pathlib import Path
+
+from harbor.agents.installed.base import BaseInstalledAgent, with_prompt_template
+from harbor.environments.base import BaseEnvironment
+from harbor.models.agent.context import AgentContext
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+CONTAINER_REPO_DIR = "/installed-agent/ouroboros"
+AGENT_LOG_NAME = "ouroboros.txt"
+AGENT_STDOUT_NAME = "ouroboros-stdout.txt"
+AGENT_STDERR_NAME = "ouroboros-stderr.txt"
+
+# Defaults shared by run() and populate_context_post_run() so the recorded
+# metadata cannot drift from what was passed to the CLI.
+DEFAULT_MODEL = "openai/gpt-5.5"
+DEFAULT_REASONING = "medium"
+DEFAULT_MAX_STEPS = "50"
+DEFAULT_TIMEOUT_SEC = 3600
+
+# Shell prelude that puts a per-user Bun install ($HOME/.bun) on PATH.
+_BUN_ENV = 'export BUN_INSTALL="$HOME/.bun"; export PATH="$BUN_INSTALL/bin:$PATH"; '
+
+# Names and globs left out of the repo copy that is uploaded to the container.
+_UPLOAD_IGNORE_PATTERNS = (
+    ".git",
+    ".DS_Store",
+    "node_modules",
+    "dist",
+    "out",
+    "coverage",
+    ".cache",
+    ".turbo",
+    "tmp",
+    "logs",
+    "*.log",
+    ".ouroboros-transcripts.db",
+)
+
+
+def _env_or_default(name: str, default: str) -> str:
+    """Return environment variable *name*, or *default* when unset or blank."""
+    value = os.environ.get(name)
+    return value if value and value.strip() else default
+
+
+def _env_positive_int(name: str, default: int) -> int:
+    """Return environment variable *name* parsed as a positive integer.
+
+    Unset or blank values yield *default*.
+
+    Raises:
+        ValueError: If the value is not a positive integer. The message names
+            the variable so a misconfigured run is easy to diagnose.
+    """
+    raw = _env_or_default(name, str(default))
+    message = f"{name} must be a positive integer, got {raw!r}"
+    try:
+        value = int(raw)
+    except ValueError:
+        raise ValueError(message) from None
+    if value <= 0:
+        raise ValueError(message)
+    return value
+
+
+def _run_settings() -> dict[str, str]:
+    """Return the model, reasoning effort, and step limit for the current run."""
+    return {
+        "model": _env_or_default("OUROBOROS_TBENCH_MODEL", DEFAULT_MODEL),
+        "reasoning_effort": _env_or_default("OUROBOROS_TBENCH_REASONING", DEFAULT_REASONING),
+        "max_steps": _env_or_default("OUROBOROS_TBENCH_MAX_STEPS", DEFAULT_MAX_STEPS),
+    }
+
+
+class OuroborosInstalledAgent(BaseInstalledAgent):
+    """Harbor installed-agent adapter for running Ouroboros CLI on TB 2.0."""
+
+    SUPPORTS_ATIF = False
+
+    @staticmethod
+    def name() -> str:
+        """Return the agent name recorded in run metadata."""
+        return "ouroboros"
+
+    def get_version_command(self) -> str | None:
+        """Return a shell command that prints the version of the built CLI."""
+        return (
+            f"cd {shlex.quote(CONTAINER_REPO_DIR)} && "
+            "./packages/cli/dist/ouroboros --version"
+        )
+
+    async def install(self, environment: BaseEnvironment) -> None:
+        """Install prerequisites and Bun, upload the repo, and build the CLI.
+
+        Steps, in order: install system packages as root, install Bun for the
+        agent user when it is missing, upload a filtered copy of the repo to
+        ``CONTAINER_REPO_DIR``, then run ``bun install`` and the CLI build there.
+        """
+        await self.exec_as_root(
+            environment,
+            command=(
+                "if command -v apk >/dev/null 2>&1; then "
+                "apk add --no-cache bash curl unzip ca-certificates tar; "
+                "elif command -v apt-get >/dev/null 2>&1; then "
+                "apt-get update && apt-get install -y "
+                "bash curl unzip ca-certificates tar; "
+                "elif command -v yum >/dev/null 2>&1; then "
+                "yum install -y bash curl unzip ca-certificates tar; "
+                "else "
+                "echo 'No supported package manager found; assuming prerequisites exist' >&2; "
+                "fi"
+            ),
+            env={"DEBIAN_FRONTEND": "noninteractive"},
+            timeout_sec=300,
+        )
+
+        await self.exec_as_agent(
+            environment,
+            command=(
+                # Search $HOME/.bun too, so an existing per-user install is reused.
+                f"{_BUN_ENV}"
+                "if ! command -v bun >/dev/null 2>&1; then "
+                # Download first, then execute: with `curl | bash` a failed
+                # download still exits 0, because a pipeline reports the status
+                # of its last command, and the error only surfaces at build time.
+                'installer="${TMPDIR:-/tmp}/bun-install.sh"; '
+                'curl -fsSL https://bun.sh/install -o "$installer" && '
+                'bash "$installer"; '
+                "fi"
+            ),
+            timeout_sec=300,
+        )
+
+        upload_dir = self._prepare_repo_upload()
+        try:
+            await environment.upload_dir(upload_dir, CONTAINER_REPO_DIR)
+        finally:
+            # The staged copy is only needed for the upload; do not keep a
+            # copy of the repo in every trial's logs directory.
+            shutil.rmtree(upload_dir, ignore_errors=True)
+
+        await self.exec_as_agent(
+            environment,
+            command=f"{_BUN_ENV}bun install && bun run --filter @ouroboros/cli build",
+            cwd=CONTAINER_REPO_DIR,
+            timeout_sec=900,
+        )
+
+    @with_prompt_template
+    async def run(
+        self, instruction: str, environment: BaseEnvironment, context: AgentContext
+    ) -> None:
+        """Run the Ouroboros CLI once on *instruction* and capture its output.
+
+        stdout and stderr are written to separate files under ``/logs/agent``
+        and then concatenated into ``AGENT_LOG_NAME``; the shell command exits
+        with the CLI's own exit status.
+
+        Raises:
+            ValueError: If ``OUROBOROS_TBENCH_TIMEOUT_SEC`` is not a positive
+                integer.
+        """
+        settings = _run_settings()
+        # Parse up front so a bad value fails before any command is built.
+        timeout_sec = _env_positive_int("OUROBOROS_TBENCH_TIMEOUT_SEC", DEFAULT_TIMEOUT_SEC)
+
+        env = {
+            "OPENAI_API_KEY": os.environ.get("OPENAI_API_KEY", ""),
+            "OUROBOROS_TBENCH_MODEL": settings["model"],
+            "OUROBOROS_TBENCH_REASONING": settings["reasoning_effort"],
+            "OUROBOROS_TBENCH_MAX_STEPS": settings["max_steps"],
+        }
+        # Drop empty values (for example an unset API key) rather than passing blanks.
+        env = {key: value for key, value in env.items() if value}
+
+        command = (
+            "mkdir -p /logs/agent && "
+            f"{_BUN_ENV}"
+            f"{shlex.quote(CONTAINER_REPO_DIR)}/packages/cli/dist/ouroboros "
+            f"--model {shlex.quote(settings['model'])} "
+            f"--reasoning-effort {shlex.quote(settings['reasoning_effort'])} "
+            "--no-stream --no-rsi "
+            f"--max-steps {shlex.quote(settings['max_steps'])} "
+            f"-m {shlex.quote(instruction)} "
+            f"> /logs/agent/{AGENT_STDOUT_NAME} "
+            f"2> /logs/agent/{AGENT_STDERR_NAME}; "
+            # Remember the CLI's status so the log concatenation cannot mask it.
+            "status=$?; "
+            f"cat /logs/agent/{AGENT_STDOUT_NAME} "
+            f"/logs/agent/{AGENT_STDERR_NAME} > /logs/agent/{AGENT_LOG_NAME}; "
+            "exit $status"
+        )
+
+        await self.exec_as_agent(
+            environment,
+            command=command,
+            env=env,
+            timeout_sec=timeout_sec,
+        )
+
+    def populate_context_post_run(self, context: AgentContext) -> None:
+        """Record the run settings, log paths, and log excerpts on *context*."""
+        log_path = self.logs_dir / AGENT_LOG_NAME
+        stdout_path = self.logs_dir / AGENT_STDOUT_NAME
+        stderr_path = self.logs_dir / AGENT_STDERR_NAME
+
+        context.metadata = {
+            "agent": self.name(),
+            **_run_settings(),
+            "log_path": str(log_path),
+            "stdout_path": str(stdout_path),
+            "stderr_path": str(stderr_path),
+            "log_excerpt": self._read_excerpt(log_path),
+            "stdout_excerpt": self._read_excerpt(stdout_path),
+            "stderr_excerpt": self._read_excerpt(stderr_path),
+        }
+
+    def _prepare_repo_upload(self) -> Path:
+        """Stage a filtered copy of the repo and return the staging directory.
+
+        The copy is written to ``<logs_dir>/repo-upload``, replacing any copy
+        left by an earlier call, and omits ``_UPLOAD_IGNORE_PATTERNS``.
+        """
+        target = self.logs_dir / "repo-upload"
+        if target.exists():
+            shutil.rmtree(target)
+
+        skip_patterns = shutil.ignore_patterns(*_UPLOAD_IGNORE_PATTERNS)
+        staging = target.resolve()
+
+        def ignore(directory: str, names: list[str]) -> set[str]:
+            skipped = set(skip_patterns(directory, names))
+            # When the logs directory lives inside the repo, never copy the
+            # staging directory into itself (copytree would recurse into its own output).
+            if staging.name in names and Path(directory).resolve() / staging.name == staging:
+                skipped.add(staging.name)
+            return skipped
+
+        shutil.copytree(REPO_ROOT, target, ignore=ignore)
+        return target
+
+    def _read_excerpt(self, path: Path, limit: int = 4000) -> str | None:
+        """Return the first *limit* characters of *path*, or None if it is not a file.
+
+        Longer files are cut at *limit* characters and suffixed with a
+        truncation marker.
+        """
+        if not path.is_file():
+            return None
+
+        # Decode as UTF-8 explicitly instead of the host locale's default, and
+        # read one extra character to detect truncation without loading the
+        # whole log into memory.
+        with path.open(encoding="utf-8", errors="replace") as handle:
+            text = handle.read(limit + 1)
+        if len(text) <= limit:
+            return text
+        return text[:limit] + "\n...[truncated]"
diff --git a/benchmarks/terminal-bench/run-pilot.sh b/benchmarks/terminal-bench/run-pilot.sh
new file mode 100755
index 0000000..6af7a12
--- /dev/null
+++ b/benchmarks/terminal-bench/run-pilot.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)" +JOBS_DIR="${OUROBOROS_TBENCH_JOBS_DIR:-/private/tmp/ouroboros-tbench/jobs}" +N_CONCURRENT="${OUROBOROS_TBENCH_N_CONCURRENT:-1}" + +export OUROBOROS_TBENCH_MODEL="${OUROBOROS_TBENCH_MODEL:-openai/gpt-5.5}" +export OUROBOROS_TBENCH_REASONING="${OUROBOROS_TBENCH_REASONING:-medium}" +export OUROBOROS_TBENCH_MAX_STEPS="${OUROBOROS_TBENCH_MAX_STEPS:-50}" + +if ! command -v uv >/dev/null 2>&1; then + echo "error: uv is required. Install it from https://docs.astral.sh/uv/." >&2 + exit 1 +fi + +if ! command -v docker >/dev/null 2>&1; then + echo "error: docker is required. Install Docker Desktop and start it." >&2 + exit 1 +fi + +if ! docker info >/dev/null 2>&1; then + echo "error: Docker daemon is not reachable. Start Docker Desktop, then retry." >&2 + exit 1 +fi + +if [[ -z "${OPENAI_API_KEY:-}" ]]; then + echo "error: OPENAI_API_KEY is required for the default openai/gpt-5.5 run." >&2 + exit 1 +fi + +if command -v harbor >/dev/null 2>&1; then + HARBOR_CMD=(harbor) +else + HARBOR_CMD=(uv tool run harbor) +fi + +mkdir -p "$JOBS_DIR" + +echo "Running Ouroboros Terminal-Bench 2.0 pilot" +echo "repo: $REPO_ROOT" +echo "jobs: $JOBS_DIR" +echo "model: $OUROBOROS_TBENCH_MODEL" +echo "reasoning: $OUROBOROS_TBENCH_REASONING" +echo "max steps: $OUROBOROS_TBENCH_MAX_STEPS" +echo "concurrency: $N_CONCURRENT" + +cd "$REPO_ROOT" + +PYTHONPATH="$SCRIPT_DIR${PYTHONPATH:+:$PYTHONPATH}" \ + "${HARBOR_CMD[@]}" run \ + --dataset terminal-bench@2.0 \ + --agent-import-path ouroboros_tbench_agent:OuroborosInstalledAgent \ + --n-concurrent "$N_CONCURRENT" \ + --jobs-dir "$JOBS_DIR" \ + "$@" diff --git a/packages/cli/src/json-rpc/handlers.ts b/packages/cli/src/json-rpc/handlers.ts index 31d35b5..c335896 100644 --- a/packages/cli/src/json-rpc/handlers.ts +++ b/packages/cli/src/json-rpc/handlers.ts @@ -619,6 +619,7 @@ export function createHandlers(ctx: HandlerContext): Map try { const agent = ctx.getAgent(sessionId ?? 
undefined) const historyBeforeRun = agent.getConversationHistory().length + const responseStartedAt = Date.now() const result = await runScoped(sessionId ?? '', () => agent.run(message, { responseStyle: @@ -632,6 +633,7 @@ export function createHandlers(ctx: HandlerContext): Map abortSignal: abort.signal, }), ) + const responseDurationMs = Math.max(0, Date.now() - responseStartedAt) if (sessionId) { // Drain skill activations attributed to *this* session — both the // user-selected one (via activateSkillForRun above) and any mid-turn @@ -641,7 +643,7 @@ export function createHandlers(ctx: HandlerContext): Map ctx.transcriptStore, sessionId, agent.getConversationHistory().slice(historyBeforeRun), - { activatedSkills }, + { activatedSkills, responseDurationMs }, ) if (!persistResult.ok) { throw new HandlerError(JSON_RPC_ERRORS.INTERNAL_ERROR.code, persistResult.error.message) @@ -1848,6 +1850,7 @@ interface DesktopSessionMessage { content: string timestamp: string imageAttachments?: ImageAttachmentMetadata[] + responseDurationMs?: number toolCalls?: DesktopToolCall[] activatedSkills?: string[] } @@ -1877,6 +1880,12 @@ function readMetadataStringArray( return strings.length > 0 ? strings : undefined } +function readMetadataDurationMs(metadata: Record | null): number | undefined { + if (!metadata) return undefined + const value = metadata.responseDurationMs + return typeof value === 'number' && Number.isFinite(value) && value >= 0 ? 
value : undefined +} + function toDesktopSessionData(session: SessionWithMessages) { const messages: DesktopSessionMessage[] = [] let currentAssistant: DesktopSessionMessage | null = null @@ -1904,6 +1913,8 @@ function toDesktopSessionData(session: SessionWithMessages) { } const activatedSkills = readMetadataStringArray(row.metadata, 'activatedSkills') if (activatedSkills) currentAssistant.activatedSkills = activatedSkills + const responseDurationMs = readMetadataDurationMs(row.metadata) + if (responseDurationMs != null) currentAssistant.responseDurationMs = responseDurationMs messages.push(currentAssistant) continue } @@ -2218,7 +2229,7 @@ function persistConversationDelta( transcriptStore: TranscriptStore, sessionId: string, historyDelta: LLMMessage[], - options: { activatedSkills?: string[] } = {}, + options: { activatedSkills?: string[]; responseDurationMs?: number } = {}, ) { // Attach activated-skills metadata to the FIRST assistant message in this // delta only. A run produces at most one user-visible assistant turn from @@ -2226,7 +2237,14 @@ function persistConversationDelta( // reasoning splits) reuse the same activations conceptually but storing // duplicates would just bloat the UI. const activatedSkills = options.activatedSkills ?? [] + const responseDurationMs = + typeof options.responseDurationMs === 'number' && + Number.isFinite(options.responseDurationMs) && + options.responseDurationMs >= 0 + ? options.responseDurationMs + : undefined let assistantSkillsConsumed = false + let assistantDurationConsumed = false for (const message of historyDelta) { if (message.role === 'user') { @@ -2242,13 +2260,20 @@ function persistConversationDelta( if (message.role === 'assistant') { if (message.content.trim().length > 0) { - const metadata = - !assistantSkillsConsumed && activatedSkills.length > 0 ? 
{ activatedSkills } : undefined - if (metadata) assistantSkillsConsumed = true + const metadata: Record = {} + if (!assistantSkillsConsumed && activatedSkills.length > 0) { + metadata.activatedSkills = activatedSkills + assistantSkillsConsumed = true + } + if (!assistantDurationConsumed && responseDurationMs != null) { + metadata.responseDurationMs = responseDurationMs + assistantDurationConsumed = true + } + const hasMetadata = Object.keys(metadata).length > 0 const addResult = transcriptStore.addMessage(sessionId, { role: 'assistant', content: message.content, - ...(metadata ? { metadata } : {}), + ...(hasMetadata ? { metadata } : {}), }) if (!addResult.ok) return addResult } diff --git a/packages/cli/tests/json-rpc.test.ts b/packages/cli/tests/json-rpc.test.ts index 78e8b4c..bcb5ec8 100644 --- a/packages/cli/tests/json-rpc.test.ts +++ b/packages/cli/tests/json-rpc.test.ts @@ -2533,10 +2533,17 @@ describe('JSON-RPC', () => { const loadResult = (await handlers.get('session/load')!({ id: newResult.sessionId, })) as { - messages: Array<{ role: string; content: string; activatedSkills?: string[] }> + messages: Array<{ + role: string + content: string + activatedSkills?: string[] + responseDurationMs?: number + }> } const assistant = loadResult.messages.find((m) => m.role === 'assistant') expect(assistant?.activatedSkills).toEqual(['meta-thinking']) + expect(typeof assistant?.responseDurationMs).toBe('number') + expect(assistant!.responseDurationMs!).toBeGreaterThanOrEqual(0) const userMsg = loadResult.messages.find((m) => m.role === 'user') expect(userMsg?.activatedSkills).toBeUndefined() diff --git a/packages/desktop/src/renderer/components/AgentMessage.tsx b/packages/desktop/src/renderer/components/AgentMessage.tsx index 89495e6..4a08536 100644 --- a/packages/desktop/src/renderer/components/AgentMessage.tsx +++ b/packages/desktop/src/renderer/components/AgentMessage.tsx @@ -1,4 +1,4 @@ -import React, { useMemo } from 'react' +import React, { useEffect, useMemo, 
useState } from 'react' import type { CompletedToolCall, Message, SubagentRun, ToolCallState } from '../../shared/protocol' import { MarkdownRenderer } from './MarkdownRenderer' import { StreamingCursor } from './StreamingCursor' @@ -51,9 +51,23 @@ const nameStyle: React.CSSProperties = { lineHeight: 1.4, letterSpacing: '0.01em', color: 'var(--text-primary)', +} + +const metaRowStyle: React.CSSProperties = { + display: 'flex', + alignItems: 'center', + gap: 8, marginBottom: 10, } +const durationStyle: React.CSSProperties = { + fontSize: 12, + lineHeight: 1.4, + fontWeight: 500, + color: 'var(--text-tertiary)', + whiteSpace: 'nowrap', +} + const contentStyle: React.CSSProperties = { minWidth: 0, width: 'min(100%, 66ch)', @@ -163,6 +177,53 @@ function completedToState(tc: CompletedToolCall): ToolCallState { } } +function formatDuration(ms: number): string { + const totalSeconds = Math.max(0, Math.round(ms / 1000)) + if (totalSeconds < 60) return `${totalSeconds}s` + + const minutes = Math.floor(totalSeconds / 60) + const seconds = totalSeconds % 60 + return `${minutes}:${seconds.toString().padStart(2, '0')}` +} + +function useElapsedDurationMs(startedAt: string | null | undefined, isRunning: boolean): number | null { + const getElapsed = (): number | null => { + if (!startedAt) return null + const start = Date.parse(startedAt) + if (!Number.isFinite(start)) return null + return Math.max(0, Date.now() - start) + } + + const [elapsed, setElapsed] = useState(() => getElapsed()) + + useEffect(() => { + setElapsed(getElapsed()) + if (!startedAt || !isRunning) return undefined + + const interval = window.setInterval(() => { + setElapsed(getElapsed()) + }, 1000) + return () => window.clearInterval(interval) + }, [startedAt, isRunning]) + + return elapsed +} + +interface MessageMetaRowProps { + durationLabel?: string +} + +const MessageMetaRow: React.FC = ({ durationLabel }) => ( +
+
Ouroboros
+ {durationLabel && ( + + {durationLabel} + + )} +
+) + function getToolProgressLabel(toolName: string): string { switch (toolName) { case 'bash': @@ -245,51 +306,58 @@ export const AgentMessage: React.FC = ({ expandedToolCallIds, onToolCallExpandedChange, onOpenTeamGraph, -}) => ( -
-
- O -
-
-
-
Ouroboros
- {message.activatedSkills && message.activatedSkills.length > 0 && ( - - )} +}) => { + const durationLabel = + typeof message.responseDurationMs === 'number' + ? `Took ${formatDuration(message.responseDurationMs)}` + : undefined + + return ( +
+
+ O +
+
- -
- {message.toolCalls && message.toolCalls.length > 0 && ( -
- {message.toolCalls.map((tc) => ( - onToolCallExpandedChange?.(tc.id, expanded)} - /> - ))} + + {message.activatedSkills && message.activatedSkills.length > 0 && ( + + )} +
+
- )} - {message.subagentRuns && message.subagentRuns.length > 0 && ( - - )} + {message.toolCalls && message.toolCalls.length > 0 && ( +
+ {message.toolCalls.map((tc) => ( + onToolCallExpandedChange?.(tc.id, expanded)} + /> + ))} +
+ )} + {message.subagentRuns && message.subagentRuns.length > 0 && ( + + )} +
-
-) + ) +} // --------------------------------------------------------------------------- // Streaming agent message (in-progress) @@ -303,6 +371,7 @@ interface StreamingAgentMessageProps { /** Skills active for the in-progress turn (from store.pendingActivatedSkills). */ activatedSkills?: readonly string[] isRunning: boolean + responseStartedAt?: string | null expandedToolCallIds?: ReadonlySet onToolCallExpandedChange?: (toolCallId: string, expanded: boolean) => void onOpenTeamGraph?: () => void @@ -315,6 +384,7 @@ export const StreamingAgentMessage: React.FC = ({ subagentRuns, activatedSkills, isRunning, + responseStartedAt, expandedToolCallIds, onToolCallExpandedChange, onOpenTeamGraph, @@ -328,6 +398,8 @@ export const StreamingAgentMessage: React.FC = ({ () => getProgressMessage(activeEntries, completedToolCalls, text, isRunning), [activeEntries, completedToolCalls, text, isRunning], ) + const elapsedMs = useElapsedDurationMs(responseStartedAt, isRunning) + const durationLabel = elapsedMs == null ? undefined : `${formatDuration(elapsedMs)} elapsed` return (
= ({ className='agent-message__surface' data-testid='agent-message-surface' > -
Ouroboros
+ {activatedSkills && activatedSkills.length > 0 && ( )} diff --git a/packages/desktop/src/renderer/stores/conversationStore.ts b/packages/desktop/src/renderer/stores/conversationStore.ts index 5087add..69a621e 100644 --- a/packages/desktop/src/renderer/stores/conversationStore.ts +++ b/packages/desktop/src/renderer/stores/conversationStore.ts @@ -55,6 +55,7 @@ export interface SessionRunSnapshot { pendingSubmittedPlan: Plan | null isAgentRunning: boolean contextUsage: AgentContextUsageParams | null + responseStartedAt: string | null nextId: number } @@ -123,6 +124,9 @@ export interface ConversationState { /** Current estimated context usage for the active conversation. */ contextUsage: AgentContextUsageParams | null + /** ISO timestamp for the currently running assistant response. */ + responseStartedAt: string | null + // -- Actions --------------------------------------------------------------- /** User sends a message. Adds the message to the list and marks agent as running. */ @@ -303,6 +307,7 @@ function emptySnapshot(): SessionRunSnapshot { pendingSubmittedPlan: null, isAgentRunning: false, contextUsage: null, + responseStartedAt: null, nextId: 1, } } @@ -319,6 +324,7 @@ function snapshotFromFlat(state: ConversationState): SessionRunSnapshot { pendingSubmittedPlan: state.pendingSubmittedPlan, isAgentRunning: state.isAgentRunning, contextUsage: state.contextUsage, + responseStartedAt: state.responseStartedAt, nextId: state.nextId, } } @@ -334,6 +340,7 @@ function flatFromSnapshot(snap: SessionRunSnapshot): { pendingSubmittedPlan: Plan | null isAgentRunning: boolean contextUsage: AgentContextUsageParams | null + responseStartedAt: string | null nextId: number } { return { @@ -346,10 +353,18 @@ function flatFromSnapshot(snap: SessionRunSnapshot): { pendingSubmittedPlan: snap.pendingSubmittedPlan, isAgentRunning: snap.isAgentRunning, contextUsage: snap.contextUsage, + responseStartedAt: snap.responseStartedAt, nextId: snap.nextId, } } +function 
responseDurationMs(startedAt: string | null, endedAt = Date.now()): number | undefined { + if (!startedAt) return undefined + const started = Date.parse(startedAt) + if (!Number.isFinite(started)) return undefined + return Math.max(0, endedAt - started) +} + /** * Apply a per-session run-state update. * @@ -1080,6 +1095,7 @@ export const useConversationStore = create((set, get) => ({ modelName: null, reasoningEffort: null, contextUsage: null, + responseStartedAt: null, // ---- Actions ------------------------------------------------------------- @@ -1092,6 +1108,7 @@ export const useConversationStore = create((set, get) => ({ } const id = makeId('user', state.nextId) const sentAt = new Date().toISOString() + const responseStartedAt = sentAt const userMessage: Message = { id, role: 'user', @@ -1115,6 +1132,7 @@ export const useConversationStore = create((set, get) => ({ pendingActivatedSkills: skillName ? [skillName] : [], pendingSubmittedPlan: null, activePlanDecision: null, + responseStartedAt, nextId: state.nextId + 1, contextUsage: null, sessions: runSessionId @@ -1251,6 +1269,7 @@ export const useConversationStore = create((set, get) => ({ role: 'agent', text: finalText, timestamp: new Date().toISOString(), + responseDurationMs: responseDurationMs(state.responseStartedAt), toolCalls: state.pendingToolCalls.length > 0 ? [...state.pendingToolCalls] : undefined, subagentRuns: state.pendingSubagentRuns.length > 0 ? 
[...state.pendingSubagentRuns] : undefined, @@ -1267,6 +1286,7 @@ export const useConversationStore = create((set, get) => ({ pendingSubagentRuns: [], pendingActivatedSkills: [], pendingSubmittedPlan: null, + responseStartedAt: null, nextId: state.nextId + 1, contextUsage: null, sessions: state.activeRunSessionId @@ -1438,6 +1458,7 @@ export const useConversationStore = create((set, get) => ({ pendingToolCalls: [], pendingSubagentRuns: [], pendingSubmittedPlan: null, + responseStartedAt: null, nextId: withStatus.nextId, }) return @@ -1461,6 +1482,7 @@ export const useConversationStore = create((set, get) => ({ role: 'agent', text: partialText, timestamp: new Date().toISOString(), + responseDurationMs: responseDurationMs(snap.responseStartedAt), toolCalls: snap.pendingToolCalls.length > 0 ? [...snap.pendingToolCalls] : undefined, subagentRuns: @@ -1478,6 +1500,7 @@ export const useConversationStore = create((set, get) => ({ activeToolCalls: new Map(), pendingToolCalls: [], pendingSubagentRuns: [], + responseStartedAt: null, })), ...(ownedActiveRun ? { @@ -1487,6 +1510,7 @@ export const useConversationStore = create((set, get) => ({ activeToolCalls: new Map(), pendingToolCalls: [], pendingSubagentRuns: [], + responseStartedAt: null, } : {}), }) @@ -1625,6 +1649,7 @@ export const useConversationStore = create((set, get) => ({ role: 'agent', text: finalAgentText(params.text, submittedPlan), timestamp: new Date().toISOString(), + responseDurationMs: responseDurationMs(s.responseStartedAt), toolCalls: s.pendingToolCalls.length > 0 ? [...s.pendingToolCalls] : undefined, subagentRuns: s.pendingSubagentRuns.length > 0 ? [...s.pendingSubagentRuns] : undefined, activatedSkills: @@ -1640,6 +1665,7 @@ export const useConversationStore = create((set, get) => ({ pendingSubagentRuns: [], pendingActivatedSkills: [], pendingSubmittedPlan: null, + responseStartedAt: null, activePlanDecision: submittedPlan ? 
{ sessionId: null, plan: submittedPlan } : null, nextId: s.nextId + 1, } @@ -1662,6 +1688,7 @@ export const useConversationStore = create((set, get) => ({ role: 'agent', text: finalAgentText(params.text, submittedPlan), timestamp: new Date().toISOString(), + responseDurationMs: responseDurationMs(snap.responseStartedAt), toolCalls: snap.pendingToolCalls.length > 0 ? [...snap.pendingToolCalls] : undefined, subagentRuns: snap.pendingSubagentRuns.length > 0 ? [...snap.pendingSubagentRuns] : undefined, @@ -1678,6 +1705,7 @@ export const useConversationStore = create((set, get) => ({ pendingSubagentRuns: [], pendingActivatedSkills: [], pendingSubmittedPlan: null, + responseStartedAt: null, nextId: snap.nextId + 1, } }) @@ -1747,6 +1775,7 @@ export const useConversationStore = create((set, get) => ({ role: 'agent', text: finalText, timestamp: new Date().toISOString(), + responseDurationMs: responseDurationMs(snap.responseStartedAt), toolCalls: snap.pendingToolCalls.length > 0 ? [...snap.pendingToolCalls] : undefined, subagentRuns: snap.pendingSubagentRuns.length > 0 ? [...snap.pendingSubagentRuns] : undefined, @@ -1767,6 +1796,7 @@ export const useConversationStore = create((set, get) => ({ pendingSubagentRuns: [], pendingActivatedSkills: [], pendingSubmittedPlan: null, + responseStartedAt: null, nextId: withStatus.nextId, contextUsage: null, } @@ -1860,6 +1890,7 @@ export const useConversationStore = create((set, get) => ({ pendingSubagentRuns: [], pendingActivatedSkills: [], pendingSubmittedPlan: null, + responseStartedAt: null, nextId: s.nextId + 1, contextUsage: null, } @@ -1878,6 +1909,7 @@ export const useConversationStore = create((set, get) => ({ role: 'agent', text: snap.streamingText, timestamp: new Date().toISOString(), + responseDurationMs: responseDurationMs(snap.responseStartedAt), toolCalls: snap.pendingToolCalls.length > 0 ? [...snap.pendingToolCalls] : undefined, subagentRuns: snap.pendingSubagentRuns.length > 0 ? 
[...snap.pendingSubagentRuns] : undefined, @@ -1900,6 +1932,7 @@ export const useConversationStore = create((set, get) => ({ pendingSubagentRuns: [], pendingActivatedSkills: [], pendingSubmittedPlan: null, + responseStartedAt: null, nextId: nextId + 1, contextUsage: null, } @@ -2102,6 +2135,9 @@ export const useConversationStore = create((set, get) => ({ text: normalizeTextContent(m.content), timestamp: m.timestamp, imageAttachments: m.imageAttachments, + ...(m.role === 'assistant' && typeof m.responseDurationMs === 'number' + ? { responseDurationMs: m.responseDurationMs } + : {}), ...(m.role === 'assistant' && m.toolCalls && m.toolCalls.length > 0 ? { toolCalls: m.toolCalls } : {}), @@ -2142,6 +2178,7 @@ export const useConversationStore = create((set, get) => ({ pendingSubmittedPlan: cliRunCompleted ? null : (existing?.pendingSubmittedPlan ?? null), isAgentRunning: cliRunCompleted ? false : state.activeRunSessionId === id, contextUsage: existing?.contextUsage ?? null, + responseStartedAt: cliRunCompleted ? null : (existing?.responseStartedAt ?? 
null), nextId: messages.length + 1, } snapshots.set(id, incoming) diff --git a/packages/desktop/src/renderer/views/ChatView.tsx b/packages/desktop/src/renderer/views/ChatView.tsx index 87925f4..0498a72 100644 --- a/packages/desktop/src/renderer/views/ChatView.tsx +++ b/packages/desktop/src/renderer/views/ChatView.tsx @@ -138,6 +138,7 @@ export const ChatView: React.FC = ({ const pendingToolCalls = useConversationStore((s) => s.pendingToolCalls) const pendingSubagentRuns = useConversationStore((s) => s.pendingSubagentRuns) const pendingActivatedSkills = useConversationStore((s) => s.pendingActivatedSkills) + const responseStartedAt = useConversationStore((s) => s.responseStartedAt) const bufferedText = useStreamingBuffer() const currentSessionId = useConversationStore((s) => s.currentSessionId) @@ -215,6 +216,7 @@ export const ChatView: React.FC = ({ subagentRuns={pendingSubagentRuns} activatedSkills={pendingActivatedSkills} isRunning={isAgentRunning} + responseStartedAt={responseStartedAt} expandedToolCallIds={expandedToolCallIds} onToolCallExpandedChange={handleToolCallExpandedChange} onOpenTeamGraph={onOpenTeamGraph} @@ -229,6 +231,7 @@ export const ChatView: React.FC = ({ pendingSubagentRuns, pendingActivatedSkills, isAgentRunning, + responseStartedAt, expandedToolCallIds, handleToolCallExpandedChange, onOpenTeamGraph, diff --git a/packages/desktop/src/shared/protocol.ts b/packages/desktop/src/shared/protocol.ts index e1398e7..9e57894 100644 --- a/packages/desktop/src/shared/protocol.ts +++ b/packages/desktop/src/shared/protocol.ts @@ -172,6 +172,8 @@ export interface Message { imageAttachments?: ImageAttachment[] /** Completed tool calls that appeared during this agent turn. */ toolCalls?: CompletedToolCall[] + /** Elapsed wall-clock time for this assistant response, in milliseconds. */ + responseDurationMs?: number /** Subagent activity that appeared during this agent turn. 
*/ subagentRuns?: SubagentRun[] /** Skills active for this agent turn (deduped by name). Set on assistant @@ -761,6 +763,8 @@ export interface SessionMessage { content: string timestamp: string imageAttachments?: ImageAttachment[] + /** Elapsed wall-clock time for this assistant response, in milliseconds. */ + responseDurationMs?: number /** Tool calls made during this assistant turn (assistant role only). */ toolCalls?: CompletedToolCall[] /** Skills active for this assistant turn (deduped by name). */ diff --git a/packages/desktop/tests/conversation-store.test.ts b/packages/desktop/tests/conversation-store.test.ts index be3a34c..aeef5ee 100644 --- a/packages/desktop/tests/conversation-store.test.ts +++ b/packages/desktop/tests/conversation-store.test.ts @@ -27,6 +27,7 @@ function resetStore(): void { workspaceModeError: null, modelName: null, contextUsage: null, + responseStartedAt: null, reasoningEffort: null, }) } @@ -71,6 +72,7 @@ describe('conversation store normalization', () => { pendingSubmittedPlan: null, isAgentRunning: true, contextUsage: null, + responseStartedAt: new Date(Date.now() - 1500).toISOString(), nextId: 1, }, ], @@ -883,6 +885,7 @@ describe('per-session notification routing', () => { currentSessionId: 'session-A', activeRunSessionId: 'session-A', messages: [], + responseStartedAt: new Date(Date.now() - 2500).toISOString(), sessions: [ { id: 'session-A', @@ -906,7 +909,66 @@ describe('per-session notification routing', () => { expect(state.messages).toHaveLength(1) expect(state.messages[0].role).toBe('agent') expect(state.messages[0].text).toBe('reply for A') + expect(state.messages[0].responseDurationMs).toBeGreaterThanOrEqual(2000) expect(state.sessions[0].runStatus).toBe('idle') + expect(state.responseStartedAt).toBeNull() + }) + + test('handleTurnComplete for a background session stores response duration in the snapshot', () => { + resetStore() + useConversationStore.setState({ + currentSessionId: 'session-A', + activeRunSessionId: 
'session-B', + messages: [ + { + id: 'user-1', + role: 'user', + text: 'A user message', + timestamp: '2025-01-01T00:00:00Z', + }, + ], + sessionRunSnapshots: new Map([ + [ + 'session-B', + { + messages: [], + streamingText: 'partial', + activeToolCalls: new Map(), + pendingToolCalls: [], + pendingSubagentRuns: [], + pendingActivatedSkills: [], + pendingSubmittedPlan: null, + isAgentRunning: true, + contextUsage: null, + responseStartedAt: new Date(Date.now() - 3200).toISOString(), + nextId: 1, + }, + ], + ]), + sessions: [ + { + id: 'session-B', + createdAt: '2025-01-01T00:00:00Z', + lastActive: '2025-01-01T00:00:00Z', + messageCount: 0, + title: 'B', + titleSource: 'auto', + runStatus: 'running', + }, + ], + }) + + useConversationStore.getState().handleTurnComplete({ + sessionId: 'session-B', + text: 'reply for B', + iterations: 1, + }) + + const state = useConversationStore.getState() + expect(state.messages.map((m) => m.text)).toEqual(['A user message']) + const bMessage = state.sessionRunSnapshots.get('session-B')?.messages[0] + expect(bMessage?.role).toBe('agent') + expect(bMessage?.responseDurationMs).toBeGreaterThanOrEqual(3000) }) test('handleSkillActivated ignores activations from non-current sessions', () => { @@ -972,6 +1034,55 @@ describe('per-session notification routing', () => { expect(state.sessions[0].runStatus).toBe('error') expect(state.activeRunSessionId).toBeNull() }) + + test('handleTurnAborted preserves response duration on partial assistant text', () => { + resetStore() + useConversationStore.setState({ + currentSessionId: 'session-A', + activeRunSessionId: 'session-A', + isAgentRunning: true, + streamingText: 'partial answer', + responseStartedAt: new Date(Date.now() - 2100).toISOString(), + sessions: [ + { + id: 'session-A', + createdAt: '2025-01-01T00:00:00Z', + lastActive: '2025-01-01T00:00:00Z', + messageCount: 0, + title: 'A', + titleSource: 'auto', + runStatus: 'running', + }, + ], + }) + + 
useConversationStore.getState().handleTurnAborted({ + sessionId: 'session-A', + iterations: 1, + partialText: 'partial answer', + }) + + const agentMessage = useConversationStore.getState().messages.find((m) => m.role === 'agent') + expect(agentMessage?.responseDurationMs).toBeGreaterThanOrEqual(2000) + }) + + test('handleAgentError preserves response duration on partial assistant text', () => { + resetStore() + useConversationStore.setState({ + currentSessionId: 'session-A', + activeRunSessionId: 'session-A', + isAgentRunning: true, + streamingText: 'partial before error', + responseStartedAt: new Date(Date.now() - 1800).toISOString(), + }) + + useConversationStore + .getState() + .handleAgentError({ sessionId: 'session-A', message: 'run failed' }) + + const agentMessage = useConversationStore.getState().messages.find((m) => m.role === 'agent') + expect(agentMessage?.responseDurationMs).toBeGreaterThanOrEqual(1000) + }) }) describe('per-session snapshots survive view switches', () => { @@ -1051,8 +1162,10 @@ describe('per-session snapshots survive view switches', () => { pendingToolCalls: [], pendingSubagentRuns: [], pendingActivatedSkills: [], + pendingSubmittedPlan: null, isAgentRunning: true, contextUsage: null, + responseStartedAt: null, nextId: 2, }, ], @@ -1101,8 +1214,10 @@ describe('per-session snapshots survive view switches', () => { pendingToolCalls: [], pendingSubagentRuns: [], pendingActivatedSkills: [], + pendingSubmittedPlan: null, isAgentRunning: true, contextUsage: null, + responseStartedAt: null, nextId: 2, }, ], @@ -1188,6 +1303,7 @@ describe('loadSession image preview hydration', () => { const before = useConversationStore.getState().messages[0] expect(before.imageAttachments?.[0].previewDataUrl).toBeUndefined() + await Promise.resolve() expect(calls).toEqual([['/tmp/watermark2.webp']]) resolveValidate({ @@ -1205,6 +1321,7 @@ describe('loadSession image preview hydration', () => { await validatePromise await Promise.resolve() + await 
Promise.resolve() const after = useConversationStore.getState().messages[0] expect(after.imageAttachments?.[0].previewDataUrl).toBe('data:image/webp;base64,AAA=') @@ -1225,6 +1342,7 @@ describe('loadSession image preview hydration', () => { role: 'assistant', content: 'Let me read it.', timestamp: '2025-01-01T00:00:01Z', + responseDurationMs: 3456, toolCalls: [ { id: 'tc-1', @@ -1246,6 +1364,7 @@ describe('loadSession image preview hydration', () => { const messages = useConversationStore.getState().messages expect(messages).toHaveLength(3) expect(messages[1].role).toBe('agent') + expect(messages[1].responseDurationMs).toBe(3456) expect(messages[1].toolCalls).toEqual([ { id: 'tc-1', diff --git a/packages/desktop/tests/e2e/renderer-contract-runtime.spec.ts b/packages/desktop/tests/e2e/renderer-contract-runtime.spec.ts index f6c8767..d3fcdbb 100644 --- a/packages/desktop/tests/e2e/renderer-contract-runtime.spec.ts +++ b/packages/desktop/tests/e2e/renderer-contract-runtime.spec.ts @@ -769,6 +769,9 @@ test('streaming assistant text renders markdown before turn completion', async ( const streamingMessage = launched.page.locator('[data-testid="agent-message"]').last() await expect(streamingMessage.getByText('Writing the response...')).toBeVisible() + await expect(streamingMessage.getByTestId('agent-response-duration')).toContainText( + /\d+s elapsed/, + ) await expect(streamingMessage.getByRole('heading', { name: 'Short answer' })).toBeVisible() await expect(streamingMessage.getByText('First point')).toBeVisible() await expect(streamingMessage.locator('.code-block-wrapper')).toBeVisible() @@ -802,6 +805,9 @@ test('streaming assistant text renders markdown before turn completion', async ( }) await expect(launched.page.getByRole('heading', { name: 'Short answer' }).last()).toBeVisible() + const completedMessage = launched.page.locator('[data-testid="agent-message"]').last() + await expect(completedMessage.getByTestId('agent-response-duration')).toContainText(/Took \d+s/) + 
await expect(completedMessage.getByTestId('agent-progress-chip')).toHaveCount(0) }) test('streaming status chip stays at the bottom of a growing assistant bubble', async ({}, testInfo) => {