diff --git a/docs/api-reference.md b/docs/api-reference.md index 36f6400..f43edb4 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -435,10 +435,7 @@ The result includes pass-rate deltas, optional tool precision/recall/F1 deltas, **Result Structure:** -```typescript snippet=src/evals/evalRunner.ts#L106-L184 -/** - * Overall result of running an eval dataset - */ +```typescript snippet=src/evals/evalRunner.ts#L121-L195 export interface EvalRunnerResult { /** * Total number of cases @@ -1043,7 +1040,12 @@ interface MCPConformanceResult { ### `EvalExpectBlock` -```typescript snippet=src/evals/datasetTypes.ts#L186-L277 +```typescript snippet=src/evals/datasetTypes.ts#L190-L288 +/** + * Unified expectation block for eval cases + * + * Mirrors the Playwright matcher API for consistency. + */ export interface EvalExpectBlock { /** * Exact response match (toMatchToolResponse) @@ -1102,8 +1104,9 @@ export interface EvalExpectBlock { }; /** - * Asserts which tools the LLM called during a mcp_host simulation. - * Only meaningful for mcp_host mode — direct mode has no tool call trace. + * Asserts which tools the LLM called during a host simulation. + * Only meaningful for mcp_host or external_host runs with high-confidence + * structured tool evidence — direct mode has no tool call trace. */ toolsTriggered?: { /** Expected tool calls */ @@ -1125,7 +1128,8 @@ export interface EvalExpectBlock { }; /** - * Asserts the number of tool calls made during a mcp_host simulation. + * Asserts the number of tool calls made during a host simulation. + * External-host runs require high-confidence structured tool evidence. 
*/ toolCallCount?: { /** Minimum number of tool calls */ @@ -1140,7 +1144,14 @@ export interface EvalExpectBlock { ### `EvalCase` -````typescript snippet=src/evals/datasetTypes.ts#L27-L139 +````typescript snippet=src/evals/datasetTypes.ts#L23-L148 +/** + * A single eval test case + * + * For 'direct' mode: toolName and args are required + * For 'mcp_host' mode: scenario and mcpHostConfig are required + * For 'external_host' mode: scenario and externalHost are required + */ export interface EvalCase { /** * Unique identifier for this test case @@ -1155,7 +1166,8 @@ export interface EvalCase { /** * Evaluation mode * - 'direct': Direct API calls to MCP tools (default) - * - 'mcp_host': LLM-driven tool selection via natural language + * - 'mcp_host': SDK/CLI host simulation via natural language + * - 'external_host': Real external MCP host driven by configured capabilities * * @default 'direct' */ @@ -1172,7 +1184,7 @@ export interface EvalCase { args?: Record; /** - * Natural language scenario for LLM to execute (optional, required for 'mcp_host' mode) + * Natural language scenario for LLM to execute (required for 'mcp_host' and 'external_host' modes) * * @example "Get the weather for London and tell me if I need an umbrella" */ @@ -1185,6 +1197,11 @@ export interface EvalCase { */ mcpHostConfig?: MCPHostConfig; + /** + * External host configuration (required for 'external_host' mode) + */ + externalHost?: ExternalHostConfig; + /** * Additional metadata for this test case * @@ -1256,18 +1273,6 @@ export interface EvalCase { } ```` -### `EvalDataset` - -```typescript -interface EvalDataset { - name: string; - description?: string; - cases: EvalCase[]; - metadata?: Record; - schemas?: Record; // Zod schemas for toMatchToolSchema assertions -} -``` - ## Next Steps - See the [Authentication Guide](./authentication.md) for OAuth and token auth diff --git a/package-lock.json b/package-lock.json index 3376a2e..147c174 100644 --- a/package-lock.json +++ b/package-lock.json 
@@ -16,6 +16,7 @@ "debug": "^4.4.3", "ink": "^5.2.1", "ink-spinner": "^5.0.0", + "ndjson": "^2.0.0", "oauth4webapi": "^3.0.0", "open": "^10.1.0", "react": "^18.3.1", @@ -30,6 +31,7 @@ "@playwright/test": "^1.49.0", "@release-it-plugins/lerna-changelog": "^8.0.1", "@types/debug": "^4.1.12", + "@types/ndjson": "^2.0.4", "@types/node": "^22.10.2", "@types/react": "^18.3.12", "@types/react-dom": "^18.3.1", @@ -5083,6 +5085,17 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/ndjson": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/@types/ndjson/-/ndjson-2.0.4.tgz", + "integrity": "sha512-ajAl7AjhFstF6waORYNSS49GL5iBKisqJlgvXuprXFKCX9fto4ordlNU3+XMgkMddgeR0WoQQBmKUk0v0dJ4pw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*", + "@types/through": "*" + } + }, "node_modules/@types/node": { "version": "22.19.3", "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.3.tgz", @@ -5149,6 +5162,16 @@ "@types/node": "*" } }, + "node_modules/@types/through": { + "version": "0.0.33", + "resolved": "https://registry.npmjs.org/@types/through/-/through-0.0.33.tgz", + "integrity": "sha512-HsJ+z3QuETzP3cswwtzt2vEIiHBk/dCcHGhbmG5X3ecnwFD/lPrMpliGXxSCg03L9AhrdwA4Oz/qfspkDW+xGQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/unist": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", @@ -10190,7 +10213,6 @@ "version": "5.0.1", "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==", - "dev": true, "license": "ISC" }, "node_modules/json-with-bigint": { @@ -11732,7 +11754,6 @@ "version": "1.2.8", "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz", "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==", 
- "dev": true, "license": "MIT", "funding": { "url": "https://github.com/sponsors/ljharb" @@ -11944,7 +11965,6 @@ "version": "2.0.0", "resolved": "https://registry.npmjs.org/ndjson/-/ndjson-2.0.0.tgz", "integrity": "sha512-nGl7LRGrzugTtaFcJMhLbpzJM6XdivmbkdlaGcrk/LXg2KL/YBC6z1g70xh0/al+oFuVFP8N8kiWRucmeEH/qQ==", - "dev": true, "license": "BSD-3-Clause", "dependencies": { "json-stringify-safe": "^5.0.1", @@ -13628,7 +13648,6 @@ "version": "3.6.2", "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", - "dev": true, "license": "MIT", "dependencies": { "inherits": "^2.0.3", @@ -14114,7 +14133,6 @@ "version": "5.2.1", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", - "devOptional": true, "funding": [ { "type": "github", @@ -14443,7 +14461,6 @@ "version": "3.2.2", "resolved": "https://registry.npmjs.org/split2/-/split2-3.2.2.tgz", "integrity": "sha512-9NThjpgZnifTkJpzTZ7Eue85S49QwpNhZTq6GRJwObb6jnLFNGB7Qm73V5HewTROPyxD0C29xqmaI68bQtV+hg==", - "dev": true, "license": "ISC", "dependencies": { "readable-stream": "^3.0.0" @@ -14534,7 +14551,6 @@ "version": "1.3.0", "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", - "dev": true, "license": "MIT", "dependencies": { "safe-buffer": "~5.2.0" @@ -14874,7 +14890,6 @@ "version": "4.0.2", "resolved": "https://registry.npmjs.org/through2/-/through2-4.0.2.tgz", "integrity": "sha512-iOqSav00cVxEEICeD7TjLB1sueEL+81Wpzp2bY17uZjZN0pWZPuo4suZ/61VujxmqSGFfgOcNuTZ85QJwNZQpw==", - "dev": true, "license": "MIT", "dependencies": { "readable-stream": "3" @@ -16438,7 +16453,6 @@ "version": "1.0.2", "resolved": 
"https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", - "dev": true, "license": "MIT" }, "node_modules/validate-npm-package-name": { diff --git a/package.json b/package.json index d5c89dc..5af7e9f 100644 --- a/package.json +++ b/package.json @@ -72,6 +72,9 @@ "preview-reporter": "tsx scripts/preview-reporter.ts", "test": "vitest run", "test:all": "npm run build && npm run format:check && npm run lint && npm run typecheck && npm test", + "test:external-host": "vitest run --config vitest.external-host.config.mts", + "test:external-host:chat": "vitest run --config vitest.external-host.config.mts -t \"Claude Chat\"", + "test:external-host:cowork": "vitest run --config vitest.external-host.config.mts -t \"Claude Cowork\"", "test:playwright": "playwright test", "test:watch": "vitest", "typecheck": "tsc --noEmit" @@ -84,6 +87,7 @@ "debug": "^4.4.3", "ink": "^5.2.1", "ink-spinner": "^5.0.0", + "ndjson": "^2.0.0", "oauth4webapi": "^3.0.0", "open": "^10.1.0", "react": "^18.3.1", @@ -95,6 +99,7 @@ "@playwright/test": "^1.49.0", "@release-it-plugins/lerna-changelog": "^8.0.1", "@types/debug": "^4.1.12", + "@types/ndjson": "^2.0.4", "@types/node": "^22.10.2", "@types/react": "^18.3.12", "@types/react-dom": "^18.3.1", diff --git a/src/assertions/validators/toolCalls.test.ts b/src/assertions/validators/toolCalls.test.ts index 42da8b7..637c613 100644 --- a/src/assertions/validators/toolCalls.test.ts +++ b/src/assertions/validators/toolCalls.test.ts @@ -182,7 +182,7 @@ describe('validateToolCalls', () => { calls: [{ name: 'search' }], }); expect(v.pass).toBe(false); - expect(v.message).toContain('mcp_host'); + expect(v.message).toContain('host simulation response'); }); }); @@ -292,6 +292,6 @@ describe('validateToolCallCount', () => { it('returns error when response is not an MCPHostSimulationResult', () => { const v = validateToolCallCount('not a 
simulation result', { exact: 1 }); expect(v.pass).toBe(false); - expect(v.message).toContain('mcp_host'); + expect(v.message).toContain('host simulation response'); }); }); diff --git a/src/assertions/validators/toolCalls.ts b/src/assertions/validators/toolCalls.ts index ccb36fe..2a11b70 100644 --- a/src/assertions/validators/toolCalls.ts +++ b/src/assertions/validators/toolCalls.ts @@ -102,9 +102,9 @@ function findMatchingCall( } /** - * Validates tool calls made during an MCP host simulation. + * Validates tool calls made during a host simulation. * - * @param response - Must be an MCPHostSimulationResult (from mcp_host mode) + * @param response - Must be an MCPHostSimulationResult-compatible response * @param expectation - Expected tool call specification */ export function validateToolCalls( @@ -115,7 +115,7 @@ export function validateToolCalls( return { pass: false, message: - 'toolsTriggered expectation requires mcp_host mode — response must be an MCPHostSimulationResult', + 'toolsTriggered expectation requires a host simulation response with structured tool calls', }; } @@ -206,9 +206,9 @@ export function validateToolCalls( } /** - * Validates the number of tool calls made during an MCP host simulation. + * Validates the number of tool calls made during a host simulation. 
* - * @param response - Must be an MCPHostSimulationResult (from mcp_host mode) + * @param response - Must be an MCPHostSimulationResult-compatible response * @param options - Count constraints (min, max, exact) */ export function validateToolCallCount( @@ -219,7 +219,7 @@ export function validateToolCallCount( return { pass: false, message: - 'toolCallCount expectation requires mcp_host mode — response must be an MCPHostSimulationResult', + 'toolCallCount expectation requires a host simulation response with structured tool calls', }; } diff --git a/src/assertions/validators/validators.test.ts b/src/assertions/validators/validators.test.ts index d4f19cc..1e7b361 100644 --- a/src/assertions/validators/validators.test.ts +++ b/src/assertions/validators/validators.test.ts @@ -183,6 +183,18 @@ describe('validateText', () => { const result = validateText(response, 'result'); expect(result.pass).toBe(true); }); + + it('should prefer host simulation final response over metadata JSON', () => { + const response = { + response: 'final answer text', + externalHost: { + traceLimitations: ['metadata-only text'], + }, + }; + + expect(validateText(response, 'final answer').pass).toBe(true); + expect(validateText(response, 'metadata-only').pass).toBe(false); + }); }); }); diff --git a/src/evals/datasetTypes.test.ts b/src/evals/datasetTypes.test.ts index 94201d7..e1df9df 100644 --- a/src/evals/datasetTypes.test.ts +++ b/src/evals/datasetTypes.test.ts @@ -92,6 +92,68 @@ describe('datasetTypes', () => { expect(() => validateEvalCase(evalCase)).not.toThrow(); }); + it('should accept external_host eval case configuration', () => { + const evalCase = { + id: 'external-1', + mode: 'external_host' as const, + scenario: 'Reply with exactly hello', + externalHost: { + driver: { + provider: 'anthropic', + product: 'claude', + surface: 'cowork', + runtime: 'desktop-app', + platform: 'macos', + }, + name: 'Claude Cowork Desktop', + timeoutMs: 120000, + capabilities: { + control: [ + { uses: 
'builtin:platform.macos' }, + { uses: 'builtin:anthropic.claude.coworkSurface' }, + ], + input: { + uses: 'builtin:desktop.macos.accessibilitySubmit', + with: { createNewConversation: false }, + }, + completion: { + uses: 'builtin:anthropic.claude.localAgentTrace', + provides: ['trace'], + }, + normalize: { + uses: 'builtin:anthropic.claude.localAgentNormalize', + }, + }, + correlation: { + strategy: 'prompt_marker', + includeInPrompt: false, + promptTemplate: 'trace: {{marker}}', + }, + options: { + appName: 'Claude', + newConversationShortcut: 'cmd+n', + }, + }, + }; + + const result = validateEvalCase(evalCase); + + expect(result).toEqual(evalCase); + }); + + it('should reject external_host configuration without a driver', () => { + const evalCase = { + id: 'external-1', + mode: 'external_host' as const, + scenario: 'Reply with exactly hello', + externalHost: { + name: 'Claude Cowork Desktop', + }, + }; + + expect(() => validateEvalCase(evalCase)).toThrow(ZodError); + }); + it('should accept eval case with complex args', () => { const evalCase = { id: 'test-1', diff --git a/src/evals/datasetTypes.ts b/src/evals/datasetTypes.ts index 0a7d7ba..01fcd7e 100644 --- a/src/evals/datasetTypes.ts +++ b/src/evals/datasetTypes.ts @@ -1,5 +1,7 @@ import { z } from 'zod'; import type { MCPHostConfig } from './mcpHost/mcpHostTypes.js'; +import type { ExternalHostConfig } from './externalHost/types.js'; +import { ExternalHostConfigSchema } from './externalHost/schema.js'; import type { SnapshotSanitizer } from '../assertions/validators/types.js'; import type { BuiltInRubric } from '../judge/judgeTypes.js'; @@ -16,13 +18,14 @@ export type { /** * Evaluation mode */ -export type EvalMode = 'direct' | 'mcp_host'; +export type EvalMode = 'direct' | 'mcp_host' | 'external_host'; /** * A single eval test case * * For 'direct' mode: toolName and args are required * For 'mcp_host' mode: scenario and mcpHostConfig are required + * For 'external_host' mode: scenario and externalHost are 
required */ export interface EvalCase { /** @@ -38,7 +41,8 @@ export interface EvalCase { /** * Evaluation mode * - 'direct': Direct API calls to MCP tools (default) - * - 'mcp_host': LLM-driven tool selection via natural language + * - 'mcp_host': SDK/CLI host simulation via natural language + * - 'external_host': Real external MCP host driven by configured capabilities * * @default 'direct' */ @@ -55,7 +59,7 @@ export interface EvalCase { args?: Record; /** - * Natural language scenario for LLM to execute (optional, required for 'mcp_host' mode) + * Natural language scenario for LLM to execute (required for 'mcp_host' and 'external_host' modes) * * @example "Get the weather for London and tell me if I need an umbrella" */ @@ -68,6 +72,11 @@ export interface EvalCase { */ mcpHostConfig?: MCPHostConfig; + /** + * External host configuration (required for 'external_host' mode) + */ + externalHost?: ExternalHostConfig; + /** * Additional metadata for this test case * @@ -241,8 +250,9 @@ export interface EvalExpectBlock { }; /** - * Asserts which tools the LLM called during a mcp_host simulation. - * Only meaningful for mcp_host mode — direct mode has no tool call trace. + * Asserts which tools the LLM called during a host simulation. + * Only meaningful for mcp_host or external_host runs with high-confidence + * structured tool evidence — direct mode has no tool call trace. */ toolsTriggered?: { /** Expected tool calls */ @@ -264,7 +274,8 @@ export interface EvalExpectBlock { }; /** - * Asserts the number of tool calls made during a mcp_host simulation. + * Asserts the number of tool calls made during a host simulation. + * External-host runs require high-confidence structured tool evidence. 
*/ toolCallCount?: { /** Minimum number of tool calls */ @@ -447,11 +458,12 @@ const EvalExpectBlockSchema = z.object({ export const EvalCaseSchema = z.object({ id: z.string().min(1, 'id must not be empty'), description: z.string().optional(), - mode: z.enum(['direct', 'mcp_host']).optional(), + mode: z.enum(['direct', 'mcp_host', 'external_host']).optional(), toolName: z.string().min(1, 'toolName must not be empty').optional(), args: z.record(z.string(), z.unknown()).optional(), scenario: z.string().optional(), mcpHostConfig: MCPHostConfigSchema.optional(), + externalHost: ExternalHostConfigSchema.optional(), metadata: z.record(z.string(), z.unknown()).optional(), iterations: z.number().int().min(1).optional(), accuracyThreshold: z.number().min(0).max(1).optional(), diff --git a/src/evals/evalRunner.externalHost.test.ts b/src/evals/evalRunner.externalHost.test.ts new file mode 100644 index 0000000..8cd888b --- /dev/null +++ b/src/evals/evalRunner.externalHost.test.ts @@ -0,0 +1,365 @@ +import { describe, expect, it, vi } from 'vitest'; +import type { MCPFixtureApi } from '../mcp/fixtures/mcpFixture.js'; +import type { + ExternalHostRunResult, + ExternalHostSimulationResult, +} from './externalHost/types.js'; + +const TEST_CORRELATION = { + strategy: 'prompt_marker', + marker: 'MCP_SERVER_TESTER_TEST', + includedInPrompt: true, +} as const; + +vi.mock('./externalHost/runtime.js', () => ({ + runExternalHostScenario: vi.fn(async () => { + const result: ExternalHostSimulationResult = { + success: true, + response: 'external host trace acknowledged.', + toolCalls: [{ name: 'search', arguments: { query: 'planning' } }], + scenario: 'unused', + usage: { + inputTokens: 10, + outputTokens: 5, + totalCostUsd: 0.01, + durationMs: 1000, + }, + externalHost: { + driver: { + provider: 'anthropic', + product: 'claude', + surface: 'cowork', + runtime: 'desktop-app', + platform: 'macos', + }, + driverSlug: 'anthropic.claude.cowork.desktop-app.macos', + displayName: 'Claude Cowork 
Desktop', + hostName: 'Claude Cowork Desktop', + hostType: 'desktop', + capabilitiesUsed: [ + 'control', + 'input', + 'completion', + 'trace', + 'normalize', + ], + traceSource: 'host-local-transcript', + traceConfidence: 'high', + traceLimitations: ['fixture limitation'], + artifacts: [ + { + kind: 'audit', + name: 'Claude audit log', + path: '/tmp/audit.jsonl', + }, + ], + session: { + id: 'local_123', + runMarker: 'MCP_SERVER_TESTER_TEST', + requestId: 'req_123', + }, + correlation: TEST_CORRELATION, + sources: { + finalAnswer: 'host-local-transcript', + toolCalls: 'host-local-transcript', + usage: 'host-local-transcript', + cost: 'host-local-transcript', + }, + evidence: { + finalAnswer: { + source: 'host-local-transcript', + confidence: 'high', + }, + toolCalls: { + source: 'host-local-transcript', + confidence: 'high', + }, + usage: { source: 'host-local-transcript', confidence: 'high' }, + cost: { source: 'host-local-transcript', confidence: 'high' }, + }, + }, + }; + return result; + }), +})); + +const { runEvalCase } = await import('./evalRunner.js'); +const { runExternalHostScenario } = await import('./externalHost/runtime.js'); + +function makeContext(): { mcp: MCPFixtureApi } { + return { + mcp: { + authType: 'none', + project: 'external-host-test', + } as MCPFixtureApi, + }; +} + +describe('runEvalCase external_host mode', () => { + it('runs an external host case through existing expectations and preserves trace metadata', async () => { + const result = await runEvalCase( + { + id: 'external-host-case', + mode: 'external_host', + scenario: 'Say hello and search.', + externalHost: { + driver: 'anthropic.claude.cowork.desktop-app.macos', + name: 'Claude Cowork Desktop', + }, + expect: { + containsText: 'trace acknowledged', + toolsTriggered: { + calls: [{ name: 'search', arguments: { query: 'planning' } }], + }, + }, + }, + makeContext() + ); + + expect(runExternalHostScenario).toHaveBeenCalledWith( + 'Say hello and search.', + { + driver: 
'anthropic.claude.cowork.desktop-app.macos', + name: 'Claude Cowork Desktop', + }, + { caseId: 'external-host-case' } + ); + expect(result.pass).toBe(true); + expect(result.toolName).toBe('external_host'); + expect(result.hostUsage).toMatchObject({ totalCostUsd: 0.01 }); + expect(result.externalHost).toMatchObject({ + hostName: 'Claude Cowork Desktop', + traceSource: 'host-local-transcript', + traceConfidence: 'high', + session: { id: 'local_123', requestId: 'req_123' }, + }); + expect(result.request?.externalHost).toEqual({ + driver: 'anthropic.claude.cowork.desktop-app.macos', + driverSlug: 'anthropic.claude.cowork.desktop-app.macos', + name: 'Claude Cowork Desktop', + hostType: undefined, + variant: undefined, + timeoutMs: undefined, + usesBuiltInDefaults: true, + correlation: { + strategy: 'prompt_marker', + includeInPrompt: true, + }, + options: undefined, + capabilities: { + control: [ + { uses: 'builtin:platform.macos' }, + { + uses: 'builtin:anthropic.claude.activateCoworkSurface', + with: { appName: 'Claude' }, + }, + ], + input: [ + { + uses: 'builtin:desktop.macos.accessibilitySubmit', + with: { appName: 'Claude', createNewConversation: true }, + }, + ], + completion: [ + { + uses: 'builtin:anthropic.claude.localAgentTrace', + provides: ['trace'], + }, + ], + normalize: [{ uses: 'builtin:anthropic.claude.localAgentNormalize' }], + }, + }); + expect(result.mcpHostTrace?.calls).toEqual([ + { + name: 'search', + arguments: { query: 'planning' }, + status: 'expected', + }, + ]); + }); + + it('fails tool assertions as trace insufficiency when external host evidence is low confidence', async () => { + vi.mocked(runExternalHostScenario).mockResolvedValueOnce({ + success: true, + response: 'external host trace acknowledged.', + toolCalls: [{ name: 'search', arguments: { query: 'planning' } }], + externalHost: { + driver: { + provider: 'anthropic', + product: 'claude', + surface: 'chat', + runtime: 'desktop-app', + platform: 'macos', + }, + driverSlug: 
'anthropic.claude.chat.desktop-app.macos', + displayName: 'Claude Chat Desktop', + hostName: 'Claude Chat Desktop', + hostType: 'desktop', + capabilitiesUsed: [ + 'control', + 'input', + 'completion', + 'trace', + 'normalize', + ], + traceSource: 'accessibility', + traceConfidence: 'low', + artifacts: [], + session: { runMarker: 'MCP_SERVER_TESTER_TEST' }, + correlation: TEST_CORRELATION, + sources: { + finalAnswer: 'accessibility', + toolCalls: 'none', + usage: 'none', + cost: 'none', + }, + evidence: { + finalAnswer: { source: 'accessibility', confidence: 'low' }, + toolCalls: { source: 'none', confidence: 'unknown' }, + }, + }, + }); + + const result = await runEvalCase( + { + id: 'external-host-low-confidence', + mode: 'external_host', + scenario: 'Say hello and search.', + externalHost: { + driver: 'anthropic.claude.chat.desktop-app.macos', + }, + expect: { + containsText: 'trace acknowledged', + toolsTriggered: { + calls: [{ name: 'search' }], + }, + }, + }, + makeContext() + ); + + expect(result.pass).toBe(false); + expect(result.mcpHostTrace).toBeUndefined(); + expect(result.expectations.toolsTriggered?.details).toContain( + 'cannot support tool-call assertions' + ); + }); + + it('requires high-confidence structured evidence for tool assertions when per-field evidence is absent', async () => { + vi.mocked(runExternalHostScenario).mockResolvedValueOnce({ + success: true, + response: 'external host trace acknowledged.', + toolCalls: [{ name: 'search', arguments: { query: 'planning' } }], + externalHost: { + driver: { + provider: 'anthropic', + product: 'claude', + surface: 'cowork', + runtime: 'desktop-app', + platform: 'macos', + }, + driverSlug: 'anthropic.claude.cowork.desktop-app.macos', + displayName: 'Claude Cowork Desktop', + hostName: 'Claude Cowork Desktop', + hostType: 'desktop', + capabilitiesUsed: [ + 'control', + 'input', + 'completion', + 'trace', + 'normalize', + ], + traceSource: 'host-local-transcript', + traceConfidence: 'medium', + 
artifacts: [], + session: { runMarker: 'MCP_SERVER_TESTER_TEST' }, + correlation: TEST_CORRELATION, + sources: { + finalAnswer: 'host-local-transcript', + toolCalls: 'host-local-transcript', + }, + }, + }); + + const result = await runEvalCase( + { + id: 'external-host-medium-confidence', + mode: 'external_host', + scenario: 'Say hello and search.', + externalHost: { + driver: 'anthropic.claude.cowork.desktop-app.macos', + }, + expect: { + toolsTriggered: { + calls: [{ name: 'search' }], + }, + }, + }, + makeContext() + ); + + expect(result.pass).toBe(false); + expect(result.mcpHostTrace).toBeUndefined(); + expect(result.expectations.toolsTriggered?.details).toContain( + 'cannot support tool-call assertions' + ); + }); + + it('counts external host driver failures as infrastructure failures across iterations', async () => { + const failure: ExternalHostRunResult = { + success: false, + error: 'Failed to submit prompt to Claude: automation permission denied', + toolCalls: [], + externalHost: { + driver: { + provider: 'anthropic', + product: 'claude', + surface: 'cowork', + runtime: 'desktop-app', + platform: 'macos', + }, + driverSlug: 'anthropic.claude.cowork.desktop-app.macos', + displayName: 'Claude Cowork Desktop', + hostName: 'Claude Cowork Desktop', + hostType: 'desktop', + capabilitiesUsed: [], + traceSource: 'none', + traceConfidence: 'unknown', + artifacts: [], + session: { runMarker: 'MCP_SERVER_TESTER_TEST' }, + correlation: TEST_CORRELATION, + failureKind: 'automation_permission_denied', + }, + }; + const deniedAgain: ExternalHostRunResult = { + ...failure, + error: 'Failed to submit prompt to Claude: still denied', + }; + vi.mocked(runExternalHostScenario) + .mockResolvedValueOnce(failure) + .mockResolvedValueOnce(deniedAgain); + + const result = await runEvalCase( + { + id: 'external-host-driver-failure', + mode: 'external_host', + scenario: 'Say hello.', + externalHost: { + driver: 'anthropic.claude.cowork.desktop-app.macos', + }, + iterations: 2, + 
expect: { + containsText: 'hello', + }, + }, + makeContext() + ); + + expect(result.pass).toBe(false); + expect(result.infrastructureErrorCount).toBe(2); + expect(result.infrastructureErrorRate).toBe(1); + expect(result.iterationResults?.every((r) => r.isInfrastructureError)).toBe( + true + ); + }); +}); diff --git a/src/evals/evalRunner.test.ts b/src/evals/evalRunner.test.ts index f2f7200..82d9f66 100644 --- a/src/evals/evalRunner.test.ts +++ b/src/evals/evalRunner.test.ts @@ -608,7 +608,9 @@ describe('toolsTriggered and toolCallCount expectations in eval runner', () => { const result = await runEvalCase(evalCase, createContext(mcp)); expect(result.expectations.toolsTriggered?.pass).toBe(false); - expect(result.expectations.toolsTriggered?.details).toContain('mcp_host'); + expect(result.expectations.toolsTriggered?.details).toContain( + 'host simulation response' + ); }); it('validates toolCallCount correctly from simulation result', async () => { diff --git a/src/evals/evalRunner.ts b/src/evals/evalRunner.ts index 5c943c5..235a712 100644 --- a/src/evals/evalRunner.ts +++ b/src/evals/evalRunner.ts @@ -5,6 +5,18 @@ import type { Tool } from '@modelcontextprotocol/sdk/types.js'; import type { ZodType } from 'zod'; import { simulateMCPHost } from './mcpHost/mcpHostSimulation.js'; import type { MCPHostSimulationResult } from './mcpHost/mcpHostTypes.js'; +import { runExternalHostScenario } from './externalHost/runtime.js'; +import type { + ExternalHostCapabilitiesConfig, + ExternalHostCorrelationConfig, + ExternalHostMetadata, + ExternalHostSimulationResult, +} from './externalHost/types.js'; +import { + driverToSlug, + normalizeHostDriver, +} from './externalHost/driverIdentity.js'; +import { getRegisteredExternalHostConfig } from './externalHost/hostRegistry.js'; import type { EvalExpectationResult, UsageMetrics } from '../types/index.js'; import type { EvalCaseResult, @@ -411,6 +423,33 @@ async function executeToolCall( throw new Error(simulationResult.error || 'MCP 
host simulation failed'); } + return { response: simulationResult }; + } else if (mode === 'external_host') { + if (!evalCase.scenario) { + throw new Error( + `Eval case ${evalCase.id}: scenario is required for external_host mode` + ); + } + + if (!evalCase.externalHost) { + throw new Error( + `Eval case ${evalCase.id}: externalHost is required for external_host mode` + ); + } + + const simulationResult = await runExternalHostScenario( + evalCase.scenario, + evalCase.externalHost, + { caseId: evalCase.id } + ); + + if (!simulationResult.success) { + return { + response: simulationResult, + error: simulationResult.error || 'External host simulation failed', + }; + } + return { response: simulationResult }; } else { // Direct mode - call tool directly @@ -670,11 +709,26 @@ function buildRequest( evalCase: EvalCase, toolOverrideVariantId?: string ): EvalCaseRequest { - const request: EvalCaseRequest = {}; + const request: EvalCaseRequest = { + mode: evalCase.mode ?? 'direct', + }; if (evalCase.description) request.description = evalCase.description; if (toolOverrideVariantId !== undefined) { request.toolOverrideVariantId = toolOverrideVariantId; } + if (evalCase.iterations !== undefined) + request.iterations = evalCase.iterations; + if (evalCase.accuracyThreshold !== undefined) { + request.accuracyThreshold = evalCase.accuracyThreshold; + } + if (evalCase.judgeReps !== undefined) request.judgeReps = evalCase.judgeReps; + if (evalCase.tags) request.tags = evalCase.tags; + if (evalCase.expect) { + request.expect = sanitizeReporterValue(evalCase.expect) as Record< + string, + unknown + >; + } if (evalCase.mode === 'mcp_host') { if (evalCase.scenario) request.scenario = evalCase.scenario; @@ -686,6 +740,45 @@ function buildRequest( }), }; } + } else if (evalCase.mode === 'external_host') { + if (evalCase.scenario) request.scenario = evalCase.scenario; + if (evalCase.externalHost) { + let driverSlug: string | undefined; + try { + driverSlug = driverToSlug( + 
normalizeHostDriver(evalCase.externalHost.driver) + ); + } catch { + driverSlug = undefined; + } + const registeredConfig = driverSlug + ? getRegisteredExternalHostConfig(driverSlug) + : undefined; + const effectiveOptions = mergeReporterOptions( + registeredConfig?.options, + evalCase.externalHost.options + ); + const effectiveCapabilities = mergeReporterCapabilities( + registeredConfig?.capabilities, + evalCase.externalHost.capabilities + ); + const effectiveCorrelation = mergeReporterCorrelation( + registeredConfig?.correlation, + evalCase.externalHost.correlation + ); + request.externalHost = { + driver: evalCase.externalHost.driver, + driverSlug, + name: evalCase.externalHost.name ?? registeredConfig?.name, + hostType: evalCase.externalHost.hostType, + variant: evalCase.externalHost.variant, + timeoutMs: evalCase.externalHost.timeoutMs, + usesBuiltInDefaults: registeredConfig !== undefined, + correlation: effectiveCorrelation, + options: sanitizeReporterRecord(effectiveOptions), + capabilities: serializeExternalHostCapabilities(effectiveCapabilities), + }; + } } else { if (evalCase.args) request.args = evalCase.args; } @@ -693,6 +786,130 @@ function buildRequest( return request; } +function mergeReporterOptions( + base: Record | undefined, + override: Record | undefined +): Record | undefined { + if (!base) { + return override; + } + if (!override) { + return base; + } + return { + ...base, + ...override, + }; +} + +function mergeReporterCapabilities( + base: ExternalHostCapabilitiesConfig | undefined, + override: ExternalHostCapabilitiesConfig | undefined +): ExternalHostCapabilitiesConfig | undefined { + if (!base) { + return override; + } + if (!override) { + return base; + } + return { + ...base, + ...override, + }; +} + +function mergeReporterCorrelation( + base: ExternalHostCorrelationConfig | undefined, + override: ExternalHostCorrelationConfig | undefined +): ExternalHostCorrelationConfig | undefined { + if (!base) { + return override; + } + if 
(!override) { + return base; + } + return { + ...base, + ...override, + }; +} + +function serializeExternalHostCapabilities( + capabilities: ExternalHostCapabilitiesConfig | undefined +): NonNullable['capabilities'] { + if (!capabilities || typeof capabilities !== 'object') { + return undefined; + } + + const serialized: NonNullable< + NonNullable['capabilities'] + > = {}; + + for (const [capability, bindingOrBindings] of Object.entries(capabilities)) { + const bindings = Array.isArray(bindingOrBindings) + ? bindingOrBindings + : [bindingOrBindings]; + serialized[capability] = bindings + .filter((binding): binding is NonNullable => + Boolean(binding) + ) + .map((binding) => ({ + uses: binding.uses, + ...(binding.provides !== undefined && { + provides: [...binding.provides], + }), + ...(binding.with !== undefined && { + with: sanitizeReporterRecord(binding.with), + }), + })); + } + + return serialized; +} + +function sanitizeReporterRecord( + value: Record | undefined +): Record | undefined { + if (!value) { + return undefined; + } + return sanitizeReporterValue(value) as Record; +} + +function sanitizeReporterValue(value: unknown): unknown { + if (Array.isArray(value)) { + return value.map((item) => sanitizeReporterValue(item)); + } + + if (value && typeof value === 'object') { + const sanitized: Record = {}; + for (const [key, nestedValue] of Object.entries( + value as Record + )) { + sanitized[key] = isSecretLikeKey(key) + ? 
'[redacted]' + : sanitizeReporterValue(nestedValue); + } + return sanitized; + } + + if (typeof value === 'function') { + return '[function]'; + } + + if (typeof value === 'bigint') { + return value.toString(); + } + + return value; +} + +function isSecretLikeKey(key: string): boolean { + return /token|secret|password|credential|authorization|api[-_]?key/i.test( + key + ); +} + function isMCPHostSimulationResult( value: unknown ): value is MCPHostSimulationResult { @@ -705,6 +922,12 @@ function isMCPHostSimulationResult( ); } +function isExternalHostSimulationResult( + value: unknown +): value is ExternalHostSimulationResult { + return isMCPHostSimulationResult(value) && 'externalHost' in value; +} + /** * Runs a single iteration of an eval case (the atomic unit of work). * Extracted from runEvalCase to support multi-iteration accuracy loops. @@ -718,6 +941,10 @@ async function runSingleIteration( // Execute tool call const { response, error } = await executeToolCall(evalCase, context.mcp); + const externalHost = + isExternalHostSimulationResult(response) && response.externalHost + ? 
response.externalHost + : undefined; // Collect expectation results from expect block let expectationResults: EvalCaseResult['expectations'] = {}; @@ -741,10 +968,24 @@ async function runSingleIteration( toolPrecision = tp; toolRecall = tr; + if (evalCase.mode === 'external_host' && externalHost) { + applyExternalHostEvidenceGating( + evalCase.expect, + externalHost, + expectationResults + ); + if (expectationResults.toolsTriggered?.pass === false) { + toolPrecision = undefined; + toolRecall = undefined; + } + } + // Build mcpHostTrace when toolsTriggered expectation is present if ( evalCase.expect.toolsTriggered !== undefined && - isMCPHostSimulationResult(response) + isMCPHostSimulationResult(response) && + (evalCase.mode !== 'external_host' || + (externalHost !== undefined && hasStructuredToolEvidence(externalHost))) ) { const expectedNames = new Set( evalCase.expect.toolsTriggered.calls.map((c) => c.name) @@ -780,7 +1021,11 @@ async function runSingleIteration( id: evalCase.id, datasetName: options.datasetName ?? 'single-case', toolName: - evalCase.scenario != null ? 'mcp_host' : (evalCase.toolName ?? 'unknown'), + evalCase.mode === 'external_host' + ? 'external_host' + : evalCase.scenario != null + ? 'mcp_host' + : (evalCase.toolName ?? 'unknown'), source: 'eval', pass: didCasePass(error, expectationResults), request: buildRequest(evalCase, options.toolOverrideVariantId), @@ -795,9 +1040,60 @@ async function runSingleIteration( toolRecall, mcpHostTrace, hostUsage, + externalHost, }; } +function applyExternalHostEvidenceGating( + expectBlock: EvalExpectBlock, + externalHost: ExternalHostMetadata, + expectationResults: EvalCaseResult['expectations'] +): void { + const needsToolEvidence = + expectBlock.toolsTriggered !== undefined || + expectBlock.toolCallCount !== undefined; + + if (!needsToolEvidence || hasStructuredToolEvidence(externalHost)) { + return; + } + + const details = `External host trace source ${ + externalHost.sources?.toolCalls ?? 
externalHost.traceSource + } (${externalHost.traceConfidence} confidence) cannot support tool-call assertions. Use protocol traces or host-native structured traces for toolsTriggered/toolCallCount.`; + + if (expectBlock.toolsTriggered !== undefined) { + expectationResults.toolsTriggered = { pass: false, details }; + } + if (expectBlock.toolCallCount !== undefined) { + expectationResults.toolCallCount = { pass: false, details }; + } +} + +function hasStructuredToolEvidence( + externalHost: ExternalHostMetadata +): boolean { + const structuredSources = [ + 'mcp-proxy', + 'mcp-server-logs', + 'host-local-transcript', + 'host-native-export', + ]; + const evidence = externalHost.evidence?.toolCalls; + + if (evidence) { + return ( + evidence.confidence === 'high' && + structuredSources.includes(evidence.source) + ); + } + + const source = externalHost.sources?.toolCalls ?? externalHost.traceSource; + return ( + externalHost.traceConfidence === 'high' && + structuredSources.includes(source) + ); +} + /** * Returns true when the error message appears to be caused by network or * infrastructure issues (connection resets, timeouts, rate limits, etc.) 
@@ -830,6 +1126,12 @@ function isInfrastructureError(err: unknown): boolean { msg.includes('429') || msg.includes('503') || msg.includes('network') || + msg.includes('automation permission') || + msg.includes('automation/accessibility') || + msg.includes('no matching claude session') || + msg.includes('timed out waiting for claude session') || + msg.includes('failed to submit prompt to claude') || + msg.includes('failed to submit prompt to desktop host') || // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure msg.includes('prompt is too long') || msg.includes('context length exceeded') || @@ -842,6 +1144,12 @@ function isInfrastructureError(err: unknown): boolean { ); } +function isExternalHostInfrastructureFailure( + externalHost: ExternalHostMetadata | undefined +): boolean { + return externalHost?.failureKind !== undefined; +} + /** * Runs a single eval case and returns the result. * When `evalCase.iterations > 1`, runs the case N times and returns accuracy. @@ -884,7 +1192,8 @@ export async function runEvalCase( // Check whether the tool call itself failed due to infrastructure (the // error is surfaced as result.error since executeToolCall swallows throws) const infraError = - result.error != null && isInfrastructureError(result.error); + isExternalHostInfrastructureFailure(result.externalHost) || + (result.error != null && isInfrastructureError(result.error)); iterationResults.push({ pass: result.pass, durationMs: result.durationMs, @@ -892,6 +1201,7 @@ export async function runEvalCase( isInfrastructureError: infraError, mcpHostTrace: result.mcpHostTrace, hostUsage: result.hostUsage, + externalHost: result.externalHost, }); } catch (err) { // runSingleIteration should not throw, but guard defensively @@ -920,7 +1230,11 @@ export async function runEvalCase( id: evalCase.id, datasetName: options.datasetName ?? 'single-case', toolName: - evalCase.scenario != null ? 'mcp_host' : (evalCase.toolName ?? 
'unknown'), + evalCase.mode === 'external_host' + ? 'external_host' + : evalCase.scenario != null + ? 'mcp_host' + : (evalCase.toolName ?? 'unknown'), source: 'eval', pass: false, error: iterationResults[0]?.error, @@ -1080,7 +1394,7 @@ export async function runEvalDataset( // Preflight cost warning: estimate the number of LLM judge API calls this run will make const estimatedJudgeCalls = casesToRun.reduce((sum, c) => { const effectiveIterations = - c.mode === 'mcp_host' + c.mode === 'mcp_host' || c.mode === 'external_host' ? (c.iterations ?? defaultLlmIterations ?? 1) : (c.iterations ?? 1); if (c.expect?.passesJudge == null) return sum; @@ -1102,10 +1416,10 @@ export async function runEvalDataset( // Build task factories for all cases const tasks = casesToRun.map((evalCase) => async () => { - // Apply defaultLlmIterations to mcp_host cases that don't specify iterations. + // Apply defaultLlmIterations to host-driven cases that don't specify iterations. // Direct mode cases are deterministic — they always stay at 1 iteration. const withIterations = - evalCase.mode === 'mcp_host' && + (evalCase.mode === 'mcp_host' || evalCase.mode === 'external_host') && evalCase.iterations === undefined && defaultLlmIterations !== undefined ? { ...evalCase, iterations: defaultLlmIterations } @@ -1116,11 +1430,11 @@ export async function runEvalDataset( // Single-iteration mcp_host runs (the default) are a valid smoke-test pattern // and are not warned about — the warning is scoped to cases that have // explicitly chosen a multi-iteration count that is too small to be reliable. - if (evalCase.mode === 'mcp_host') { + if (evalCase.mode === 'mcp_host' || evalCase.mode === 'external_host') { const effectiveIterations = withIterations.iterations ?? 
1; if (effectiveIterations > 1 && effectiveIterations < 10) { console.warn( - `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode ` + + `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in ${evalCase.mode} mode ` + `may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.` ); } diff --git a/src/evals/externalHost/builtinCapabilities.ts b/src/evals/externalHost/builtinCapabilities.ts new file mode 100644 index 0000000..ec3e55c --- /dev/null +++ b/src/evals/externalHost/builtinCapabilities.ts @@ -0,0 +1,22 @@ +import { ANTHROPIC_CLAUDE_CAPABILITIES } from './builtins/anthropicClaude.js'; +import { MACOS_DESKTOP_CAPABILITIES } from './builtins/macosDesktop.js'; +import type { ExternalHostCapabilityImplementation } from './types.js'; + +const BUILTIN_CAPABILITIES = new Map< + string, + ExternalHostCapabilityImplementation +>( + [...MACOS_DESKTOP_CAPABILITIES, ...ANTHROPIC_CLAUDE_CAPABILITIES].map( + (implementation) => [implementation.id, implementation] + ) +); + +export function listBuiltinExternalHostCapabilities(): ExternalHostCapabilityImplementation[] { + return Array.from(BUILTIN_CAPABILITIES.values()); +} + +export function resolveBuiltinExternalHostCapability( + uses: string +): ExternalHostCapabilityImplementation | undefined { + return BUILTIN_CAPABILITIES.get(uses); +} diff --git a/src/evals/externalHost/builtins/anthropicClaude.integration.test.ts b/src/evals/externalHost/builtins/anthropicClaude.integration.test.ts new file mode 100644 index 0000000..4687173 --- /dev/null +++ b/src/evals/externalHost/builtins/anthropicClaude.integration.test.ts @@ -0,0 +1,70 @@ +import { describe, expect, it } from 'vitest'; +import { runExternalHostScenario } from '../runtime.js'; + +describe('Claude external host integrations', () => { + it('drives Claude Chat Desktop and captures low-confidence visible response evidence', 
async () => { + const result = await runExternalHostScenario( + 'Please reply with exactly: external host integration acknowledged.', + { + driver: 'anthropic.claude.chat.desktop-app.macos', + name: 'Claude Chat Desktop', + timeoutMs: 30_000, + }, + { caseId: 'claude-chat-desktop-integration' } + ); + + if (!result.success) { + throw new Error( + `${result.externalHost.failureKind ?? 'unknown'}: ${result.error}` + ); + } + + expect(result.response?.toLowerCase()).toContain( + 'external host integration acknowledged' + ); + expect(result.externalHost.driverSlug).toBe( + 'anthropic.claude.chat.desktop-app.macos' + ); + expect(result.externalHost.traceSource).toBe('accessibility'); + expect(result.externalHost.traceConfidence).toBe('low'); + expect(result.externalHost.artifacts.length).toBeGreaterThan(0); + expect(result.externalHost.session.runMarker).toContain( + 'MCP_SERVER_TESTER_' + ); + }, 150_000); + + it('drives the active Claude Cowork Desktop surface and captures high-confidence local-agent trace evidence', async () => { + const result = await runExternalHostScenario( + 'Please reply with exactly: external host integration acknowledged.', + { + driver: 'anthropic.claude.cowork.desktop-app.macos', + name: 'Claude Cowork Desktop', + timeoutMs: 60_000, + options: { + newConversationShortcut: 'none', + }, + }, + { caseId: 'claude-cowork-desktop-integration' } + ); + + if (!result.success) { + throw new Error( + `${result.externalHost.failureKind ?? 
'unknown'}: ${result.error}` + ); + } + + expect(result.response?.toLowerCase()).toContain( + 'external host integration acknowledged' + ); + expect(result.externalHost.driverSlug).toBe( + 'anthropic.claude.cowork.desktop-app.macos' + ); + expect(result.externalHost.traceSource).toBe('host-local-transcript'); + expect(result.externalHost.traceConfidence).toBe('high'); + expect(result.externalHost.artifacts.length).toBeGreaterThan(0); + expect(result.externalHost.session.id).toBeDefined(); + expect(result.externalHost.session.runMarker).toContain( + 'MCP_SERVER_TESTER_' + ); + }, 150_000); +}); diff --git a/src/evals/externalHost/builtins/anthropicClaude.test.ts b/src/evals/externalHost/builtins/anthropicClaude.test.ts new file mode 100644 index 0000000..1a9b5d2 --- /dev/null +++ b/src/evals/externalHost/builtins/anthropicClaude.test.ts @@ -0,0 +1,835 @@ +import { mkdtemp, mkdir, writeFile } from 'node:fs/promises'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; +import { describe, expect, it } from 'vitest'; +import { + buildClaudeTraceMetadata, + findMatchingClaudeSessions, + extractAccessibilityResponse, + getClaudeDataDir, + looksLikeClaudeChatSurface, + parseClaudeTrace, + snapshotClaudeSessions, + waitForClaudeTrace, + type SessionCandidate, +} from './anthropicClaude.js'; + +const COWORK_DRIVER = { + provider: 'anthropic', + product: 'claude', + surface: 'cowork', + runtime: 'desktop-app', + platform: 'macos', +} as const; + +async function writeJsonl(path: string, events: unknown[]): Promise { + await writeFile( + path, + events.map((event) => JSON.stringify(event)).join('\n'), + 'utf-8' + ); +} + +describe('anthropicClaude trace parsing', () => { + it('parses final answer, usage, tool calls, and artifacts from local Claude files', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-trace-')); + const sessionId = 'local_test'; + const cliSessionId = 'cli-session'; + const sessionDir = join(root, sessionId); + const 
transcriptDir = join( + sessionDir, + '.claude', + 'projects', + '-sessions-test' + ); + await mkdir(transcriptDir, { recursive: true }); + + const metadataPath = join(root, `${sessionId}.json`); + await writeFile( + metadataPath, + JSON.stringify({ + sessionId, + cliSessionId, + initialMessage: 'marker MCP_SERVER_TESTER_TEST', + cwd: '/sessions/test', + createdAt: '2026-05-09T00:00:00.000Z', + }), + 'utf-8' + ); + + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { + type: 'result', + result: 'trace spike acknowledged.', + requestId: 'req_123', + duration_ms: 1234, + duration_api_ms: 1000, + total_cost_usd: 0.01, + usage: { + input_tokens: 10, + output_tokens: 5, + cache_read_input_tokens: 2, + }, + timestamp: '2026-05-09T00:00:02.000Z', + }, + ]); + + await writeJsonl(join(transcriptDir, `${cliSessionId}.jsonl`), [ + { + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'toolu_1', + name: 'mcp__server__search', + input: { query: 'planning' }, + }, + ], + }, + }, + ]); + + const candidate: SessionCandidate = { + id: sessionId, + metadataPath, + sessionDir, + statMtimeMs: Date.now(), + metadata: { + sessionId, + cliSessionId, + initialMessage: 'marker MCP_SERVER_TESTER_TEST', + cwd: '/sessions/test', + }, + }; + + const trace = await parseClaudeTrace(candidate); + + expect(trace.finalAnswer).toBe('trace spike acknowledged.'); + expect(trace.requestId).toBe('req_123'); + expect(trace.usage).toMatchObject({ + inputTokens: 10, + outputTokens: 5, + totalCostUsd: 0.01, + durationMs: 1234, + durationApiMs: 1000, + cacheReadInputTokens: 2, + }); + expect(trace.toolCalls).toEqual([ + { + id: 'toolu_1', + name: 'search', + arguments: { query: 'planning' }, + }, + ]); + expect(trace.transcriptPath).toContain(`${cliSessionId}.jsonl`); + expect(trace.isComplete).toBe(true); + expect(trace.auditParsed).toBe(true); + expect(trace.transcriptParsed).toBe(true); + expect(trace.usageAvailable).toBe(true); + expect(trace.costAvailable).toBe(true); + }); 
+ + it('does not treat assistant text without a result event as a completed run', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-pending-')); + const sessionId = 'local_pending'; + const sessionDir = join(root, sessionId); + await mkdir(sessionDir, { recursive: true }); + const metadataPath = join(root, `${sessionId}.json`); + await writeFile( + metadataPath, + JSON.stringify({ + sessionId, + initialMessage: 'marker MCP_SERVER_TESTER_PENDING', + createdAt: new Date().toISOString(), + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { + type: 'assistant', + message: { + content: [{ type: 'text', text: 'partial assistant response' }], + }, + }, + ]); + + const candidate: SessionCandidate = { + id: sessionId, + metadataPath, + sessionDir, + statMtimeMs: Date.now(), + metadata: { + sessionId, + initialMessage: 'marker MCP_SERVER_TESTER_PENDING', + }, + }; + + const trace = await parseClaudeTrace(candidate); + + expect(trace.finalAnswer).toBe('partial assistant response'); + expect(trace.isComplete).toBe(false); + }); + + it('continues parsing valid JSONL events when one line is malformed', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-jsonl-')); + const sessionId = 'local_jsonl'; + const sessionDir = join(root, sessionId); + await mkdir(sessionDir, { recursive: true }); + const metadataPath = join(root, `${sessionId}.json`); + await writeFile( + metadataPath, + JSON.stringify({ + sessionId, + initialMessage: 'marker MCP_SERVER_TESTER_JSONL', + }), + 'utf-8' + ); + await writeFile( + join(sessionDir, 'audit.jsonl'), + [ + JSON.stringify({ type: 'assistant', result: 'ignored' }), + '{not-json', + JSON.stringify({ type: 'result', result: 'final answer' }), + ].join('\n'), + 'utf-8' + ); + + const candidate: SessionCandidate = { + id: sessionId, + metadataPath, + sessionDir, + statMtimeMs: Date.now(), + metadata: { + sessionId, + initialMessage: 'marker MCP_SERVER_TESTER_JSONL', + }, + }; + + const trace = 
await parseClaudeTrace(candidate); + + expect(trace.finalAnswer).toBe('final answer'); + expect(trace.isComplete).toBe(true); + expect(trace.parseWarnings.join('\n')).toContain( + 'discarded 1 malformed JSONL line' + ); + }); + + it('only marks evidence fields high confidence when the parsed trace supports them', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-evidence-')); + const sessionId = 'local_evidence'; + const sessionDir = join(root, sessionId); + await mkdir(sessionDir, { recursive: true }); + const metadataPath = join(root, `${sessionId}.json`); + await writeFile( + metadataPath, + JSON.stringify({ + sessionId, + initialMessage: 'marker MCP_SERVER_TESTER_EVIDENCE', + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { + type: 'result', + result: 'final answer', + total_cost_usd: 0.01, + usage: { input_tokens: 1, output_tokens: 2 }, + }, + ]); + + const trace = await parseClaudeTrace({ + id: sessionId, + metadataPath, + sessionDir, + statMtimeMs: Date.now(), + metadata: { + sessionId, + initialMessage: 'marker MCP_SERVER_TESTER_EVIDENCE', + }, + }); + const metadata = buildClaudeTraceMetadata({ + config: { + driver: COWORK_DRIVER, + name: 'Claude Cowork Desktop', + }, + context: { + runId: 'run', + caseId: 'case', + scenario: 'scenario', + submittedScenario: 'scenario', + marker: 'MCP_SERVER_TESTER_EVIDENCE', + correlation: { + strategy: 'prompt_marker', + marker: 'MCP_SERVER_TESTER_EVIDENCE', + includedInPrompt: true, + }, + timeoutMs: 1000, + startedAtMs: Date.now(), + }, + driver: COWORK_DRIVER, + displayName: 'Claude Cowork Desktop', + artifacts: [], + trace, + limitations: [], + }); + + expect(metadata.evidence?.finalAnswer).toEqual({ + source: 'host-local-transcript', + confidence: 'high', + }); + expect(metadata.evidence?.toolCalls).toEqual({ + source: 'none', + confidence: 'unknown', + }); + expect(metadata.evidence?.usage).toEqual({ + source: 'host-local-transcript', + confidence: 'high', + }); + 
expect(metadata.evidence?.cost).toEqual({ + source: 'host-local-transcript', + confidence: 'high', + }); + expect(metadata.traceConfidence).toBe('high'); + expect(metadata.traceLimitations?.join('\n')).toContain( + 'Tool-call evidence is unavailable' + ); + }); + + it('allows capability-local Claude data directory options to override driver-wide options', () => { + expect( + getClaudeDataDir( + { + driver: COWORK_DRIVER, + options: { dataDir: '/global/claude' }, + }, + { with: { dataDir: '/capability/claude' } } + ) + ).toBe('/capability/claude'); + }); + + it('matches sessions by marker instead of timing alone', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-match-')); + const sessionDir = join(root, 'local_match'); + await mkdir(sessionDir, { recursive: true }); + const metadataPath = join(root, 'local_match.json'); + + await writeFile( + metadataPath, + JSON.stringify({ + sessionId: 'local_match', + initialMessage: 'hello MCP_SERVER_TESTER_MATCH', + createdAt: new Date().toISOString(), + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { type: 'result', result: 'done' }, + ]); + + const matches = await findMatchingClaudeSessions({ + dataDir: root, + marker: 'MCP_SERVER_TESTER_MATCH', + snapshot: new Map(), + startedAtMs: Date.now() - 1000, + }); + + expect(matches).toHaveLength(1); + expect(matches[0]?.finalAnswer).toBe('done'); + }); + + it('handles numeric Claude metadata timestamps when checking recency', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-numeric-time-')); + const sessionDir = join(root, 'local_numeric_time'); + await mkdir(sessionDir, { recursive: true }); + await writeFile( + join(root, 'local_numeric_time.json'), + JSON.stringify({ + sessionId: 'local_numeric_time', + initialMessage: 'hello MCP_SERVER_TESTER_NUMERIC_TIME', + createdAt: Date.now(), + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { type: 'result', result: 'numeric timestamp done' }, + 
]); + + const snapshot = await snapshotClaudeSessions(root); + const matches = await findMatchingClaudeSessions({ + dataDir: root, + marker: 'MCP_SERVER_TESTER_NUMERIC_TIME', + snapshot, + startedAtMs: Date.now() - 1000, + }); + + expect(matches).toHaveLength(1); + expect(matches[0]?.finalAnswer).toBe('numeric timestamp done'); + }); + + it('snapshots existing sessions so old unchanged files are ignored', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-snapshot-')); + await mkdir(join(root, 'local_old'), { recursive: true }); + await writeFile( + join(root, 'local_old.json'), + JSON.stringify({ + sessionId: 'local_old', + initialMessage: 'MCP_SERVER_TESTER_OLD', + }), + 'utf-8' + ); + + const snapshot = await snapshotClaudeSessions(root); + const matches = await findMatchingClaudeSessions({ + dataDir: root, + marker: 'MCP_SERVER_TESTER_OLD', + snapshot, + startedAtMs: Date.now(), + }); + + expect(matches).toEqual([]); + }); + + it('detects reused sessions when audit files change after the snapshot', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-reuse-')); + const sessionDir = join(root, 'local_reuse'); + await mkdir(sessionDir, { recursive: true }); + await writeFile( + join(root, 'local_reuse.json'), + JSON.stringify({ + sessionId: 'local_reuse', + initialMessage: 'MCP_SERVER_TESTER_REUSE', + }), + 'utf-8' + ); + + const snapshot = await snapshotClaudeSessions(root); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { type: 'result', result: 'reuse done' }, + ]); + + const matches = await findMatchingClaudeSessions({ + dataDir: root, + marker: 'MCP_SERVER_TESTER_REUSE', + snapshot, + startedAtMs: Date.now(), + }); + + expect(matches).toHaveLength(1); + expect(matches[0]?.finalAnswer).toBe('reuse done'); + }); + + it('does not use a pre-marker result as completion for a reused session', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-reuse-marker-')); + const sessionId = 'local_reuse_marker'; + const 
sessionDir = join(root, sessionId); + await mkdir(sessionDir, { recursive: true }); + const metadataPath = join(root, `${sessionId}.json`); + await writeFile( + metadataPath, + JSON.stringify({ + sessionId, + initialMessage: 'old run', + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { type: 'result', result: 'old completed answer' }, + { + type: 'assistant', + message: { + content: [ + { + type: 'text', + text: 'MCP_SERVER_TESTER_REUSED_MARKER partial new response', + }, + ], + }, + }, + ]); + + const trace = await parseClaudeTrace( + { + id: sessionId, + metadataPath, + sessionDir, + statMtimeMs: Date.now(), + metadata: { + sessionId, + initialMessage: 'old run', + }, + }, + 'MCP_SERVER_TESTER_REUSED_MARKER' + ); + + expect(trace.finalAnswer).toBe( + 'MCP_SERVER_TESTER_REUSED_MARKER partial new response' + ); + expect(trace.isComplete).toBe(false); + }); + + it('does not use a pre-marker result when metadata contains the marker but the audit is still pending', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-metadata-marker-')); + const sessionId = 'local_metadata_marker'; + const sessionDir = join(root, sessionId); + await mkdir(sessionDir, { recursive: true }); + const metadataPath = join(root, `${sessionId}.json`); + await writeFile( + metadataPath, + JSON.stringify({ + sessionId, + initialMessage: 'MCP_SERVER_TESTER_METADATA_MARKER prompt', + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { type: 'result', result: 'old completed answer' }, + { + type: 'assistant', + message: { + content: [ + { + type: 'text', + text: 'MCP_SERVER_TESTER_METADATA_MARKER partial new response', + }, + ], + }, + }, + ]); + + const trace = await parseClaudeTrace( + { + id: sessionId, + metadataPath, + sessionDir, + statMtimeMs: Date.now(), + metadata: { + sessionId, + initialMessage: 'MCP_SERVER_TESTER_METADATA_MARKER prompt', + }, + }, + 'MCP_SERVER_TESTER_METADATA_MARKER' + ); + + 
expect(trace.finalAnswer).toBe( + 'MCP_SERVER_TESTER_METADATA_MARKER partial new response' + ); + expect(trace.isComplete).toBe(false); + }); + + it('does not combine a transcript marker with a pre-marker audit result', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-cross-source-marker-')); + const sessionId = 'local_cross_source_marker'; + const cliSessionId = 'cli-cross-source'; + const sessionDir = join(root, sessionId); + const transcriptDir = join(sessionDir, '.claude', 'projects', '-project'); + await mkdir(transcriptDir, { recursive: true }); + const metadataPath = join(root, `${sessionId}.json`); + await writeFile( + metadataPath, + JSON.stringify({ + sessionId, + cliSessionId, + initialMessage: 'old run', + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { type: 'result', result: 'old completed answer' }, + ]); + await writeJsonl(join(transcriptDir, `${cliSessionId}.jsonl`), [ + { + type: 'assistant', + message: { + content: [ + { + type: 'text', + text: 'MCP_SERVER_TESTER_CROSS_SOURCE partial new response', + }, + ], + }, + }, + ]); + + const trace = await parseClaudeTrace( + { + id: sessionId, + metadataPath, + sessionDir, + statMtimeMs: Date.now(), + metadata: { + sessionId, + cliSessionId, + initialMessage: 'old run', + }, + }, + 'MCP_SERVER_TESTER_CROSS_SOURCE' + ); + + expect(trace.finalAnswer).toBe( + 'MCP_SERVER_TESTER_CROSS_SOURCE partial new response' + ); + expect(trace.isComplete).toBe(false); + }); + + it('normalizes MCP tool names when server names contain underscores', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-tool-name-')); + const sessionId = 'local_tool_name'; + const cliSessionId = 'cli-session'; + const sessionDir = join(root, sessionId); + const transcriptDir = join(sessionDir, '.claude', 'projects', '-project'); + await mkdir(transcriptDir, { recursive: true }); + const metadataPath = join(root, `${sessionId}.json`); + await writeFile( + metadataPath, + 
JSON.stringify({ + sessionId, + cliSessionId, + initialMessage: 'marker MCP_SERVER_TESTER_TOOL', + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { type: 'result', result: 'done' }, + ]); + await writeJsonl(join(transcriptDir, `${cliSessionId}.jsonl`), [ + { + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'toolu_1', + name: 'mcp__my_server__search', + input: { query: 'planning' }, + }, + ], + }, + }, + ]); + + const trace = await parseClaudeTrace({ + id: sessionId, + metadataPath, + sessionDir, + statMtimeMs: Date.now(), + metadata: { + sessionId, + cliSessionId, + initialMessage: 'marker MCP_SERVER_TESTER_TOOL', + }, + }); + + expect(trace.toolCalls[0]?.name).toBe('search'); + }); + + it('waits for a terminal result event before returning a matched trace', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-wait-result-')); + const sessionDir = join(root, 'local_wait'); + await mkdir(sessionDir, { recursive: true }); + await writeFile( + join(root, 'local_wait.json'), + JSON.stringify({ + sessionId: 'local_wait', + initialMessage: 'MCP_SERVER_TESTER_WAIT', + createdAt: new Date().toISOString(), + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { + type: 'assistant', + message: { content: [{ type: 'text', text: 'partial' }] }, + }, + ]); + + const tracePromise = waitForClaudeTrace({ + dataDir: root, + marker: 'MCP_SERVER_TESTER_WAIT', + correlation: { + strategy: 'prompt_marker', + marker: 'MCP_SERVER_TESTER_WAIT', + includedInPrompt: true, + }, + snapshot: new Map(), + timeoutMs: 2_500, + startedAtMs: Date.now() - 1000, + }); + + await new Promise((resolve) => setTimeout(resolve, 900)); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { + type: 'assistant', + message: { content: [{ type: 'text', text: 'partial' }] }, + }, + { type: 'result', result: 'complete' }, + ]); + + await expect(tracePromise).resolves.toMatchObject({ + finalAnswer: 'complete', + 
isComplete: true, + }); + }); + + it('waits briefly for an expected embedded transcript after the result event', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-wait-transcript-')); + const sessionId = 'local_wait_transcript'; + const cliSessionId = 'cli-session'; + const sessionDir = join(root, sessionId); + const transcriptDir = join(sessionDir, '.claude', 'projects', '-project'); + await mkdir(transcriptDir, { recursive: true }); + await writeFile( + join(root, `${sessionId}.json`), + JSON.stringify({ + sessionId, + cliSessionId, + initialMessage: 'MCP_SERVER_TESTER_TRANSCRIPT', + createdAt: new Date().toISOString(), + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { type: 'result', result: 'complete' }, + ]); + + const tracePromise = waitForClaudeTrace({ + dataDir: root, + marker: 'MCP_SERVER_TESTER_TRANSCRIPT', + correlation: { + strategy: 'prompt_marker', + marker: 'MCP_SERVER_TESTER_TRANSCRIPT', + includedInPrompt: true, + }, + snapshot: new Map(), + timeoutMs: 3_500, + startedAtMs: Date.now() - 1000, + }); + + await new Promise((resolve) => setTimeout(resolve, 900)); + await writeJsonl(join(transcriptDir, `${cliSessionId}.jsonl`), [ + { + type: 'assistant', + message: { + content: [ + { + type: 'tool_use', + id: 'toolu_1', + name: 'mcp__server__search', + input: { query: 'planning' }, + }, + ], + }, + }, + ]); + + await expect(tracePromise).resolves.toMatchObject({ + finalAnswer: 'complete', + transcriptParsed: true, + toolCalls: [{ name: 'search', arguments: { query: 'planning' } }], + }); + }); + + it('discovers nested Claude local-agent session metadata', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-nested-')); + const nested = join(root, 'workspace', 'project'); + await mkdir(join(nested, 'local_nested'), { recursive: true }); + await writeFile( + join(nested, 'local_nested.json'), + JSON.stringify({ + sessionId: 'local_nested', + initialMessage: 'MCP_SERVER_TESTER_NESTED', + createdAt: 
new Date().toISOString(), + }), + 'utf-8' + ); + await writeJsonl(join(nested, 'local_nested', 'audit.jsonl'), [ + { type: 'result', result: 'nested done' }, + ]); + + const matches = await findMatchingClaudeSessions({ + dataDir: root, + marker: 'MCP_SERVER_TESTER_NESTED', + snapshot: new Map(), + startedAtMs: Date.now() - 1000, + }); + + expect(matches).toHaveLength(1); + expect(matches[0]?.finalAnswer).toBe('nested done'); + }); + + it('can match a single fresh Claude local-agent session without a prompt marker', async () => { + const root = await mkdtemp(join(tmpdir(), 'claude-no-marker-')); + const sessionDir = join(root, 'local_no_marker'); + await mkdir(sessionDir, { recursive: true }); + await writeFile( + join(root, 'local_no_marker.json'), + JSON.stringify({ + sessionId: 'local_no_marker', + initialMessage: 'plain prompt without marker', + createdAt: new Date().toISOString(), + }), + 'utf-8' + ); + await writeJsonl(join(sessionDir, 'audit.jsonl'), [ + { type: 'result', result: 'plain prompt done' }, + ]); + + const matches = await findMatchingClaudeSessions({ + dataDir: root, + marker: 'MCP_SERVER_TESTER_NOT_IN_PROMPT', + correlation: { + strategy: 'none', + marker: 'MCP_SERVER_TESTER_NOT_IN_PROMPT', + includedInPrompt: false, + }, + snapshot: new Map(), + startedAtMs: Date.now() - 1000, + }); + + expect(matches).toHaveLength(1); + expect(matches[0]?.finalAnswer).toBe('plain prompt done'); + }); + + it('extracts final answer from accessibility fallback text', () => { + expect( + extractAccessibilityResponse( + [ + 'You said: Please reply with exactly: external host integration acknowledged.', + '[eval-run-marker:MCP_SERVER_TESTER_TEST]', + 'Claude responded: external host integration acknowledged.', + 'Write a message...', + ].join('\n') + ) + ).toBe('external host integration acknowledged.'); + }); + + it('extracts final answer from comma-separated accessibility fallback text', () => { + expect( + extractAccessibilityResponse( + 'You said: prompt 
[eval-run-marker:MCP_SERVER_TESTER_TEST], Claude responded: external host integration acknowledged., Write a message...' + ) + ).toBe('external host integration acknowledged.'); + }); + + it('recognizes the regular Claude Chat surface from visible controls', () => { + expect( + looksLikeClaudeChatSurface( + [ + 'New chat', + 'Projects', + 'Artifacts', + 'Ask your org', + 'Write a message...', + ].join('\n') + ) + ).toBe(true); + }); + + it('does not classify a local-agent surface from generic composer text alone', () => { + expect( + looksLikeClaudeChatSurface( + ['Claude Code', 'Session', 'Write a message...'].join('\n') + ) + ).toBe(false); + }); +}); diff --git a/src/evals/externalHost/builtins/anthropicClaude.ts b/src/evals/externalHost/builtins/anthropicClaude.ts new file mode 100644 index 0000000..2f852bc --- /dev/null +++ b/src/evals/externalHost/builtins/anthropicClaude.ts @@ -0,0 +1,1389 @@ +import { randomUUID } from 'node:crypto'; +import { readdir, readFile, stat } from 'node:fs/promises'; +import { homedir } from 'node:os'; +import { basename, dirname, join } from 'node:path'; +import { Readable } from 'node:stream'; +import { parse as parseNdjson } from 'ndjson'; +import type { LLMToolCall } from '../../mcpHost/mcpHostTypes.js'; +import type { + ExternalHostConfig, + ExternalHostCapabilityContext, + ExternalHostCapabilityImplementation, + ExternalHostFailureKind, + ExternalHostMetadata, + ExternalHostRunResult, + HostArtifact, + HostCapability, + HostDriverId, + HostRunContext, +} from '../types.js'; +import type { UsageMetrics } from '../../../types/index.js'; +import { driverToSlug, hostTypeFromDriver } from '../driverIdentity.js'; +import { + readMacosAccessibilityText, + readMacosFrontWindowContents, + runAppleScript, +} from './macosDesktop.js'; + +const DEFAULT_APP_NAME = 'Claude'; +const POLL_INTERVAL_MS = 750; +const TRACE_SETTLE_AFTER_COMPLETE_MS = 1_500; +const CLAUDE_DESKTOP_MACOS_CAPABILITIES = [ + 'control', + 'input', + 'completion', + 
'trace', + 'normalize', +] as const; + +export interface ClaudeSessionMetadata { + sessionId?: string; + cliSessionId?: string; + createdAt?: string | number; + lastActivityAt?: string | number; + cwd?: string; + model?: string; + title?: string; + initialMessage?: string; +} + +export interface SessionCandidate { + id: string; + metadataPath: string; + sessionDir: string; + statMtimeMs: number; + metadata: ClaudeSessionMetadata; +} + +interface SnapshotEntry { + mtimeMs: number; +} + +export type ClaudeSessionSnapshot = Map; + +export interface ClaudeTrace { + candidate: SessionCandidate; + auditPath?: string; + transcriptPath?: string; + finalAnswer?: string; + toolCalls: LLMToolCall[]; + usage?: UsageMetrics; + requestId?: string; + completedAt?: string; + llmDurationMs?: number; + terminalReason?: string; + isError?: boolean; + isComplete: boolean; + auditParsed: boolean; + transcriptParsed: boolean; + usageAvailable: boolean; + costAvailable: boolean; + parseWarnings: string[]; + rawText: string; +} + +interface ClaudeAuditEvent { + type?: string; + result?: unknown; + is_error?: boolean; + duration_ms?: number; + duration_api_ms?: number; + total_cost_usd?: number; + requestId?: string; + request_id?: string; + usage?: Record; + message?: { + content?: Array<{ + type?: string; + id?: string; + name?: string; + input?: Record; + text?: string; + }>; + }; + timestamp?: string; + terminal_reason?: string; +} + +export const ANTHROPIC_CLAUDE_CAPABILITIES: ExternalHostCapabilityImplementation[] = + [ + { + id: 'builtin:anthropic.claude.coworkSurface', + capabilities: ['control'], + run: rejectClaudeChatSurfaceCapability, + }, + { + id: 'builtin:anthropic.claude.activateCoworkSurface', + capabilities: ['control'], + run: activateCoworkSurfaceCapability, + }, + { + id: 'builtin:anthropic.claude.accessibilityTrace', + capabilities: ['completion', 'trace', 'normalize'], + run: captureClaudeChatAccessibilityResultCapability, + }, + { + id: 
'builtin:anthropic.claude.localAgentTrace', + capabilities: ['completion', 'trace'], + setup: snapshotClaudeSessionsCapability, + run: captureClaudeCoworkAgentTraceCapability, + }, + { + id: 'builtin:anthropic.claude.localAgentNormalize', + capabilities: ['normalize'], + run: normalizeClaudeCoworkAgentTraceCapability, + }, + ]; + +/** + * Deterministically switches the Claude desktop app to the Cowork surface via + * Cmd+2 (the app's built-in shortcut for the Cowork sidebar tab). Idempotent — + * sending Cmd+2 while already on Cowork is a no-op. Replaces the older + * rejectClaudeChatSurface capability for use cases that need automatic surface + * activation (e.g. CI runs). + */ +async function activateCoworkSurfaceCapability({ + config, + run, + binding, + state, +}: ExternalHostCapabilityContext): Promise { + const appName = + runStringOption(config, binding, 'appName') ?? DEFAULT_APP_NAME; + const settleDelayMs = 700; + const script = ` +tell application ${JSON.stringify(appName)} to activate +delay 0.4 +tell application "System Events" + tell process ${JSON.stringify(appName)} + set frontmost to true + keystroke "2" using command down + end tell +end tell +delay ${settleDelayMs / 1000} +return "ok" +`; + try { + await runAppleScript(script, { timeoutMs: 8_000 }); + } catch (err) { + return failureResult({ + config, + context: run, + driver: state.driver, + displayName: state.displayName, + capabilitiesUsed: state.capabilitiesUsed, + failureKind: 'submission_failed', + error: `Failed to activate Cowork surface via Cmd+2: ${formatError(err)}`, + artifacts: [], + limitations: [ + 'Cowork surface activation depends on Cmd+2 being bound to the Cowork sidebar tab in the user-installed Claude app version.', + ], + }); + } +} + +async function rejectClaudeChatSurfaceCapability({ + config, + run, + binding, + state, +}: ExternalHostCapabilityContext): Promise { + const appName = + runStringOption(config, binding, 'appName') ?? 
DEFAULT_APP_NAME; + const chatSurfaceReason = await detectClaudeChatSurface(appName); + if (!chatSurfaceReason) { + return; + } + + return failureResult({ + config, + context: run, + driver: state.driver, + displayName: state.displayName, + capabilitiesUsed: state.capabilitiesUsed, + failureKind: 'submission_failed', + error: `${state.displayName} surface is not active: ${chatSurfaceReason}`, + artifacts: [], + limitations: [ + 'Cowork is a distinct Claude Desktop surface; this driver will not submit Cowork evals through the regular Claude Chat composer.', + 'Open or focus an active Cowork/local-agent session before running this driver, or add a deterministic Cowork launch step.', + ], + }); +} + +async function snapshotClaudeSessionsCapability({ + config, + run, + binding, + state, +}: ExternalHostCapabilityContext): Promise { + const dataDir = getClaudeDataDir(config, binding); + state.data.claudeDataDir = dataDir; + + try { + state.data.claudeSessionSnapshot = await snapshotClaudeSessions(dataDir); + } catch (err) { + return failureResult({ + config, + context: run, + driver: state.driver, + displayName: state.displayName, + capabilitiesUsed: state.capabilitiesUsed, + failureKind: 'parse_failure', + error: `Failed to snapshot Claude session directory: ${formatError(err)}`, + artifacts: [], + limitations: [`Claude data directory: ${dataDir}`], + }); + } +} + +async function captureClaudeChatAccessibilityResultCapability({ + config, + run, + binding, + state, +}: ExternalHostCapabilityContext): Promise { + try { + return await waitForAccessibilityTrace({ + config, + context: run, + driver: state.driver, + displayName: state.displayName, + capabilitiesUsed: state.capabilitiesUsed, + timeoutMs: run.timeoutMs, + appName: runStringOption(config, binding, 'appName'), + }); + } catch (err) { + const message = formatError(err); + return failureResult({ + config, + context: run, + driver: state.driver, + displayName: state.displayName, + capabilitiesUsed: 
state.capabilitiesUsed, + failureKind: classifyTraceFailure(message), + error: message, + artifacts: [], + limitations: [ + 'Claude Chat Desktop currently uses Accessibility as the fallback trace source; IndexedDB parsing has not been stabilized.', + ], + }); + } +} + +async function captureClaudeCoworkAgentTraceCapability({ + config, + run, + binding, + state, +}: ExternalHostCapabilityContext): Promise { + const dataDir = + typeof state.data.claudeDataDir === 'string' + ? state.data.claudeDataDir + : getClaudeDataDir(config, binding); + const snapshot = state.data.claudeSessionSnapshot as + | ClaudeSessionSnapshot + | undefined; + + if (!snapshot) { + return failureResult({ + config, + context: run, + driver: state.driver, + displayName: state.displayName, + capabilitiesUsed: state.capabilitiesUsed, + failureKind: 'parse_failure', + error: 'Claude Cowork trace step requires a session snapshot.', + artifacts: [], + limitations: [`Claude data directory: ${dataDir}`], + }); + } + + try { + state.data.claudeTrace = await waitForClaudeTrace({ + dataDir, + marker: run.marker, + correlation: run.correlation, + snapshot, + timeoutMs: run.timeoutMs, + startedAtMs: run.startedAtMs, + }); + } catch (err) { + const message = formatError(err); + return failureResult({ + config, + context: run, + driver: state.driver, + displayName: state.displayName, + capabilitiesUsed: state.capabilitiesUsed, + failureKind: classifyTraceFailure(message), + error: message, + artifacts: [], + limitations: [`Claude data directory: ${dataDir}`], + }); + } +} + +async function normalizeClaudeCoworkAgentTraceCapability({ + config, + run, + state, +}: ExternalHostCapabilityContext): Promise { + const trace = state.data.claudeTrace as ClaudeTrace | undefined; + if (!trace) { + return failureResult({ + config, + context: run, + driver: state.driver, + displayName: state.displayName, + capabilitiesUsed: state.capabilitiesUsed, + failureKind: 'parse_failure', + error: 'Claude Cowork trace normalization 
/**
 * Reads `key` from an optional option bag and returns it only when the stored
 * value is a string (the empty string included).
 *
 * @param options - Option bag to read from; may be `undefined`.
 * @param key - Option name to look up.
 * @returns The string value, or `undefined` when the bag is missing, the key
 * is absent, or the value is not a string.
 */
function stringOption(
  options: Record<string, unknown> | undefined,
  key: string
): string | undefined {
  const candidate = options?.[key];
  if (typeof candidate !== 'string') {
    return undefined;
  }
  return candidate;
}

/**
 * Reads a string option from `config.options`.
 *
 * Delegates to `stringOption` so the "strings only" rule lives in one place.
 */
// NOTE(review): assumes `config.options` is an optional string-keyed record —
// confirm against the `ExternalHostConfig` declaration.
function configStringOption(
  config: ExternalHostConfig,
  key: string
): string | undefined {
  return stringOption(config.options, key);
}

/**
 * Resolves a string option for a single run: the binding's `with` bag takes
 * precedence, and `config.options` is the fallback.
 *
 * The fallback uses `??`, so an empty string on the binding is kept rather
 * than being replaced by the config value.
 */
function runStringOption(
  config: ExternalHostConfig,
  binding: { with?: Record<string, unknown> },
  key: string
): string | undefined {
  const fromBinding = stringOption(binding.with, key);
  return fromBinding ?? configStringOption(config, key);
}
/**
 * Records the observed modification time of every session candidate under
 * `dataDir`, keyed by the candidate's metadata path.
 *
 * `findMatchingClaudeSessions` compares later observations against this map
 * to tell new or updated sessions from pre-existing ones.
 */
// NOTE(review): the return type argument was missing from the reviewed text;
// `ClaudeSessionSnapshot` is inferred from how the snapshot is consumed —
// confirm it is assignable from `Map<string, { mtimeMs: number }>`.
export async function snapshotClaudeSessions(
  dataDir: string
): Promise<ClaudeSessionSnapshot> {
  const snapshot = new Map<string, { mtimeMs: number }>();
  for (const { metadataPath, statMtimeMs } of await listSessionCandidates(
    dataDir
  )) {
    snapshot.set(metadataPath, { mtimeMs: statMtimeMs });
  }
  return snapshot;
}

/**
 * Polls for the single Claude session that matches this run until it is ready
 * or `timeoutMs` has elapsed.
 *
 * Each iteration calls `findMatchingClaudeSessions`, then sleeps for
 * `POLL_INTERVAL_MS`. The first moment a complete trace is seen is remembered
 * and passed to `isTraceReady` on later iterations.
 *
 * @returns The matching trace once `isTraceReady` accepts it.
 * @throws Error when more than one session matches (ambiguous correlation).
 * @throws Error on timeout; the message names the pending session when one
 * was seen, otherwise it reports that no session matched.
 */
export async function waitForClaudeTrace(options: {
  dataDir: string;
  marker: string;
  correlation: HostRunContext['correlation'];
  snapshot: ClaudeSessionSnapshot;
  timeoutMs: number;
  startedAtMs: number;
}): Promise<ClaudeTrace> {
  const deadline = Date.now() + options.timeoutMs;
  let lastPending: ClaudeTrace | undefined;
  let completeTraceFirstSeenAtMs: number | undefined;

  while (Date.now() < deadline) {
    const matches = await findMatchingClaudeSessions(options);

    if (matches.length > 1) {
      throw new Error(
        `Ambiguous Claude sessions for ${describeCorrelation(options)}: ${matches
          .map((m) => m.candidate.id)
          .join(', ')}`
      );
    }

    // At most one element remains here; destructuring yields `undefined` for
    // an empty list, so no non-null assertion is needed.
    const [trace] = matches;
    if (trace) {
      if (isTraceReady(trace, completeTraceFirstSeenAtMs)) {
        return trace;
      }
      // Stamp the first sighting of a complete trace only after the readiness
      // check, so the check above always sees the previous iteration's value.
      if (trace.isComplete && completeTraceFirstSeenAtMs === undefined) {
        completeTraceFirstSeenAtMs = Date.now();
      }
      lastPending = trace;
    }

    await delay(POLL_INTERVAL_MS);
  }

  if (lastPending) {
    throw new Error(
      `Timed out waiting for Claude session ${lastPending.candidate.id} to complete`
    );
  }

  throw new Error(
    `No matching Claude session found for ${describeCorrelation(options)}`
  );
}
/**
 * Parses every plausible session under `dataDir` and returns the traces that
 * correlate with the current run.
 *
 * A session is considered only when it is new or updated relative to
 * `snapshot`, or when its metadata `createdAt` is no more than five seconds
 * before `startedAtMs`. The marker is passed to `parseClaudeTrace` unless
 * `correlation.includedInPrompt` is explicitly `false`.
 *
 * @returns All matching traces; callers decide how to treat zero or several.
 */
export async function findMatchingClaudeSessions(options: {
  dataDir: string;
  marker: string;
  correlation?: HostRunContext['correlation'];
  snapshot: ClaudeSessionSnapshot;
  startedAtMs: number;
}): Promise<ClaudeTrace[]> {
  // Tolerance for sessions whose recorded creation time slightly precedes the
  // recorded start of the run.
  const recentSlackMs = 5_000;
  const earliestCreatedAtMs = options.startedAtMs - recentSlackMs;
  // Loop-invariant: the marker cannot scope events when it was never sent.
  const parseMarker =
    options.correlation?.includedInPrompt === false
      ? undefined
      : options.marker;

  const traces: ClaudeTrace[] = [];
  for (const session of await listSessionCandidates(options.dataDir)) {
    const previous = options.snapshot.get(session.metadataPath);
    const isNewOrUpdated =
      previous === undefined || session.statMtimeMs > previous.mtimeMs;
    const createdAtMs = metadataTimestampMs(session.metadata.createdAt);
    const isRecent =
      !Number.isNaN(createdAtMs) && createdAtMs >= earliestCreatedAtMs;

    if (!isNewOrUpdated && !isRecent) {
      continue;
    }

    const trace = await parseClaudeTrace(session, parseMarker);
    const matches = sessionMatchesCorrelation({
      session,
      trace,
      marker: options.marker,
      correlation: options.correlation,
      isNewOrUpdated,
      isRecent,
    });
    if (matches) {
      traces.push(trace);
    }
  }

  return traces;
}

/**
 * Describes, for error messages, how a run is being correlated: by its marker
 * when the marker was included in the prompt, otherwise by the correlation
 * strategy name (`none` when no strategy is set).
 */
function describeCorrelation(options: {
  marker: string;
  correlation?: HostRunContext['correlation'];
}): string {
  const { marker, correlation } = options;
  if (correlation?.includedInPrompt) {
    return `marker ${marker}`;
  }
  const strategy = correlation?.strategy ?? 'none';
  return `${strategy} correlation near the run start`;
}

/**
 * Tries once to recover a final answer from the app's visible Accessibility
 * text.
 *
 * The app name is `options.appName`, then the `appName` config option, then
 * `DEFAULT_APP_NAME`.
 *
 * @returns `undefined` when the Accessibility read throws, when the text does
 * not contain `context.marker`, or when no response can be extracted.
 * Otherwise a successful result with no tool calls and low-confidence
 * `accessibility` evidence for the final answer only.
 */
// NOTE(review): the return type argument was missing from the reviewed text;
// `ExternalHostRunResult | undefined` is inferred from the returned shape —
// confirm against the declared type.
async function readAccessibilityFallback(
  config: ExternalHostConfig,
  context: HostRunContext,
  driver: HostDriverId,
  displayName: string,
  capabilitiesUsed: readonly HostCapability[],
  options: { appName?: string } = {}
): Promise<ExternalHostRunResult | undefined> {
  const appName =
    options.appName ??
    configStringOption(config, 'appName') ??
    DEFAULT_APP_NAME;

  let visibleText: string;
  try {
    visibleText = await readMacosAccessibilityText(appName);
  } catch {
    // Best-effort source: an unreadable window just means "nothing yet".
    return undefined;
  }

  // Text without this run's marker belongs to some other conversation.
  if (!visibleText.includes(context.marker)) {
    return undefined;
  }

  const response = extractAccessibilityResponse(visibleText);
  if (!response) {
    return undefined;
  }

  return {
    success: true,
    toolCalls: [],
    response,
    conversationHistory: [{ role: 'assistant', content: response }],
    externalHost: {
      ...buildHostIdentityMetadata(config, driver, displayName),
      hostVariant: config.variant,
      capabilitiesUsed: [...capabilitiesUsed],
      traceSource: 'accessibility',
      traceConfidence: 'low',
      traceLimitations: [
        'Claude did not produce a matching local-agent transcript; final answer was captured from the visible Accessibility tree.',
        'Tool calls, token usage, cost, and hidden context are unavailable from this fallback source.',
      ],
      artifacts: [
        {
          kind: 'trace',
          name: 'Claude visible accessibility text',
          contentType: 'text/plain',
          // Only the first 1000 characters are kept as the artifact summary.
          summary: visibleText.slice(0, 1000),
        },
      ],
      session: {
        runMarker: context.marker,
      },
      correlation: context.correlation,
      sources: {
        finalAnswer: 'accessibility',
        toolCalls: 'none',
        usage: 'none',
        cost: 'none',
      },
      evidence: {
        finalAnswer: { source: 'accessibility', confidence: 'low' },
        toolCalls: { source: 'none', confidence: 'unknown' },
        usage: { source: 'none', confidence: 'unknown' },
        cost: { source: 'none', confidence: 'unknown' },
      },
    },
  };
}
/**
 * Heuristic: does the visible text look like the regular Claude Chat surface?
 *
 * Counts how many of five known control labels occur in `visibleText`
 * (case-sensitive substring match) and requires at least three.
 */
export function looksLikeClaudeChatSurface(visibleText: string): boolean {
  const chatSignals = [
    'New chat',
    'Projects',
    'Artifacts',
    'Ask your org',
    'Write a message',
  ];
  const requiredSignals = 3;

  let seen = 0;
  for (const signal of chatSignals) {
    if (visibleText.includes(signal)) {
      seen += 1;
    }
  }
  return seen >= requiredSignals;
}

/**
 * Polls `readAccessibilityFallback` every `POLL_INTERVAL_MS` until it yields
 * a result or `timeoutMs` has elapsed.
 *
 * @returns The first fallback result obtained.
 * @throws Error naming the run marker when the deadline passes without one.
 */
// NOTE(review): the return type argument was missing from the reviewed text;
// `ExternalHostRunResult` is inferred from the fallback's result shape —
// confirm against the declared type.
async function waitForAccessibilityTrace(options: {
  config: ExternalHostConfig;
  context: HostRunContext;
  driver: HostDriverId;
  displayName: string;
  capabilitiesUsed: readonly HostCapability[];
  timeoutMs: number;
  appName?: string;
}): Promise<ExternalHostRunResult> {
  const { config, context, driver, displayName, capabilitiesUsed } = options;
  const deadline = Date.now() + options.timeoutMs;

  while (Date.now() < deadline) {
    const fallback = await readAccessibilityFallback(
      config,
      context,
      driver,
      displayName,
      capabilitiesUsed,
      { appName: options.appName }
    );
    if (fallback) {
      return fallback;
    }
    await delay(POLL_INTERVAL_MS);
  }

  throw new Error(
    `Timed out waiting for Claude Chat Desktop visible response for marker ${options.context.marker}`
  );
}
await findFile( + candidate.sessionDir, + `${candidate.metadata.cliSessionId}.jsonl` + ) + : undefined; + + let auditEvents: ClaudeAuditEvent[] = []; + let transcriptEvents: ClaudeAuditEvent[] = []; + let rawAudit = ''; + let rawTranscript = ''; + let auditParsed = false; + let transcriptParsed = false; + + try { + rawAudit = await readFile(auditPath, 'utf-8'); + const parsed = await parseNdjsonContent( + rawAudit, + 'Claude audit log' + ); + auditEvents = parsed.events; + auditParsed = parsed.events.length > 0; + parseWarnings.push(...parsed.warnings); + } catch (err) { + parseWarnings.push(`Could not read Claude audit log: ${formatError(err)}`); + } + + if (transcriptPath) { + try { + rawTranscript = await readFile(transcriptPath, 'utf-8'); + const parsed = await parseNdjsonContent( + rawTranscript, + 'Claude transcript' + ); + transcriptEvents = parsed.events; + transcriptParsed = parsed.ok; + parseWarnings.push(...parsed.warnings); + } catch (err) { + parseWarnings.push( + `Could not read Claude transcript: ${formatError(err)}` + ); + } + } else if (candidate.metadata.cliSessionId) { + parseWarnings.push( + `Could not locate transcript for cliSessionId ${candidate.metadata.cliSessionId}.` + ); + } + + const auditEventsForRun = selectEventsForMarker( + candidate.metadata, + auditEvents, + marker + ); + const transcriptEventsForRun = selectEventsForMarker( + candidate.metadata, + transcriptEvents, + marker + ); + const combinedEventsForRun = [ + ...auditEventsForRun, + ...transcriptEventsForRun, + ]; + const resultEvent = + findLastResultEvent(auditEventsForRun) ?? + findLastResultEvent(transcriptEventsForRun); + const finalAnswer = + typeof resultEvent?.result === 'string' + ? resultEvent.result + : extractAssistantText(combinedEventsForRun); + const usage = resultEvent ? extractUsage(resultEvent) : undefined; + const toolCalls = extractToolCalls( + transcriptEventsForRun.length > 0 + ? 
/**
 * Narrows a session's events to the ones that belong to the run identified by
 * `marker`.
 *
 * - No marker (or an empty one): every event is returned unchanged.
 * - Marker found in an event's JSON serialization: returns that event and all
 *   later ones.
 * - Marker found in no event: returns every event when the session's
 *   `initialMessage` contains the marker, otherwise an empty list.
 */
function selectEventsForMarker(
  metadata: ClaudeSessionMetadata,
  events: ClaudeAuditEvent[],
  marker?: string
): ClaudeAuditEvent[] {
  if (!marker) {
    return events;
  }

  const firstMarkedIndex = events.findIndex((event) =>
    JSON.stringify(event).includes(marker)
  );
  if (firstMarkedIndex >= 0) {
    return events.slice(firstMarkedIndex);
  }

  // The marker may appear only in the session's opening message, in which
  // case the whole session is attributed to this run.
  const sessionStartedWithMarker =
    metadata.initialMessage?.includes(marker) ?? false;
  return sessionStartedWithMarker ? events : [];
}
[] + : [ + 'Trace was matched by recently updated host artifacts because no prompt marker was included.', + ]; + const limitations = buildTraceLimitations(options.trace, [ + ...options.limitations, + ...correlationLimitations, + ]); + const traceConfidence = getTraceConfidence( + options.trace, + options.context.correlation + ); + const finalAnswerEvidence = buildEvidence( + options.trace.isComplete && options.trace.finalAnswer !== undefined, + traceConfidence + ); + const toolCallsEvidence = buildEvidence( + options.trace.transcriptParsed, + traceConfidence + ); + const usageEvidence = buildEvidence( + options.trace.usageAvailable, + traceConfidence + ); + const costEvidence = buildEvidence( + options.trace.costAvailable, + traceConfidence + ); + + return { + ...buildHostIdentityMetadata( + options.config, + options.driver, + options.displayName + ), + hostVariant: options.config.variant, + capabilitiesUsed: [ + ...(options.capabilitiesUsed ?? CLAUDE_DESKTOP_MACOS_CAPABILITIES), + ], + traceSource: 'host-local-transcript', + traceConfidence, + traceLimitations: limitations.length > 0 ? limitations : undefined, + artifacts: options.artifacts, + session: { + id: + options.trace.candidate.metadata.sessionId ?? 
/**
 * Builds one evidence entry: when the datum is available it is attributed to
 * the host-local transcript at the given confidence; otherwise it is reported
 * as coming from no source with `unknown` confidence.
 */
function buildEvidence(
  available: boolean,
  confidence: ExternalHostMetadata['traceConfidence']
) {
  if (!available) {
    return { source: 'none', confidence: 'unknown' } as const;
  }
  return { source: 'host-local-transcript', confidence } as const;
}

/**
 * Grades how far a parsed trace can be trusted.
 *
 * - `unknown`: the trace is incomplete or the audit log was not parsed.
 * - `medium`: a parse warning starts with `Claude audit log discarded`, or
 *   the run marker was not included in the prompt.
 * - `high`: no such warning and the marker was included in the prompt.
 */
function getTraceConfidence(
  trace: ClaudeTrace,
  correlation: HostRunContext['correlation']
): ExternalHostMetadata['traceConfidence'] {
  if (!trace.isComplete || !trace.auditParsed) {
    return 'unknown';
  }

  const auditLinesWereDiscarded = trace.parseWarnings.some((warning) =>
    warning.startsWith('Claude audit log discarded')
  );
  const fullyTrusted = !auditLinesWereDiscarded && correlation.includedInPrompt;
  return fullyTrusted ? 'high' : 'medium';
}
/**
 * Builds the result for a run that failed before any usable trace existed.
 *
 * The result carries no tool calls, `traceSource: 'none'`, and `unknown`
 * trace confidence. The caller's limitations and artifacts are passed through
 * as given, and `capabilitiesUsed` is copied (empty when omitted).
 */
function failureResult(options: {
  config: ExternalHostConfig;
  context: HostRunContext;
  driver: HostDriverId;
  displayName: string;
  capabilitiesUsed?: readonly HostCapability[];
  failureKind: ExternalHostFailureKind;
  error: string;
  artifacts: HostArtifact[];
  limitations: string[];
}): ExternalHostRunResult {
  const { config, context, driver, displayName } = options;
  const identity = buildHostIdentityMetadata(config, driver, displayName);

  return {
    success: false,
    toolCalls: [],
    error: options.error,
    externalHost: {
      ...identity,
      hostVariant: config.variant,
      capabilitiesUsed: [...(options.capabilitiesUsed ?? [])],
      traceSource: 'none',
      traceConfidence: 'unknown',
      traceLimitations: options.limitations,
      artifacts: options.artifacts,
      session: { runMarker: context.marker },
      correlation: context.correlation,
      failureKind: options.failureKind,
    },
  };
}
hostTypeFromDriver(driver), + }; +} + +function buildArtifacts(trace: ClaudeTrace): HostArtifact[] { + const artifacts: HostArtifact[] = [ + { + kind: 'metadata', + name: 'Claude session metadata', + path: trace.candidate.metadataPath, + contentType: 'application/json', + }, + ]; + + if (trace.auditPath) { + artifacts.push({ + kind: 'audit', + name: 'Claude audit log', + path: trace.auditPath, + contentType: 'application/x-ndjson', + }); + } + + if (trace.transcriptPath) { + artifacts.push({ + kind: 'transcript', + name: 'Claude transcript', + path: trace.transcriptPath, + contentType: 'application/x-ndjson', + }); + } + + return artifacts; +} + +export function extractAccessibilityResponse( + visibleText: string +): string | undefined { + const lines = visibleText + .split('\n') + .map((line) => line.trim()) + .filter(Boolean); + const responseLine = [...lines] + .reverse() + .find((line) => line.startsWith('Claude responded: ')); + if (responseLine) { + return responseLine.slice('Claude responded: '.length).trim(); + } + + const inlineResponseMatch = /Claude responded:\s*([^,\n]+)/.exec(visibleText); + if (inlineResponseMatch?.[1]) { + return inlineResponseMatch[1].trim(); + } + + const markerIndex = lines.findIndex((line) => + line.includes('[eval-run-marker:') + ); + if (markerIndex >= 0) { + return lines + .slice(markerIndex + 1) + .find( + (line) => + !line.startsWith('Write a message') && + !line.includes('Claude is AI and can make mistakes') + ); + } + + return undefined; +} + +async function listSessionCandidates( + dataDir: string +): Promise { + const metadataPaths = await findClaudeMetadataFiles(dataDir); + const candidates: SessionCandidate[] = []; + + for (const metadataPath of metadataPaths) { + try { + const metadata = JSON.parse( + await readFile(metadataPath, 'utf-8') + ) as ClaudeSessionMetadata; + const metadataStat = await stat(metadataPath); + const id = basename(metadataPath, '.json'); + const sessionDir = join(dirname(metadataPath), id); + 
/**
 * Recursively collects every file under `root` whose name matches
 * `local_<something>.json`.
 *
 * Directories that cannot be read (missing, permission denied, …) are skipped
 * silently. Traversal is depth-first using an explicit stack, so the order of
 * the returned paths follows directory listing order and is not sorted.
 *
 * @param root - Directory to start scanning from.
 * @returns Paths of all matching metadata files (empty when none are found).
 */
async function findClaudeMetadataFiles(root: string): Promise<string[]> {
  const metadataNamePattern = /^local_.+\.json$/;
  const pending: string[] = [root];
  const found: string[] = [];

  for (let dir = pending.pop(); dir !== undefined; dir = pending.pop()) {
    let children;
    try {
      children = await readdir(dir, { withFileTypes: true });
    } catch {
      // Unreadable directory: ignore it and keep scanning the rest.
      continue;
    }

    for (const child of children) {
      const childPath = join(dir, child.name);
      if (child.isDirectory()) {
        pending.push(childPath);
      } else if (child.isFile() && metadataNamePattern.test(child.name)) {
        found.push(childPath);
      }
    }
  }

  return found;
}
ClaudeTrace; + marker: string; + correlation?: HostRunContext['correlation']; + isNewOrUpdated: boolean; + isRecent: boolean; +}): boolean { + if (options.correlation?.includedInPrompt !== false) { + return sessionMatchesMarker(options.session, options.trace, options.marker); + } + + return options.isNewOrUpdated || options.isRecent; +} + +async function parseNdjsonContent( + content: string, + sourceName: string +): Promise<{ events: T[]; ok: boolean; warnings: string[] }> { + const events: T[] = []; + const parser = parseNdjson({ strict: false }); + + await new Promise((resolve, reject) => { + parser.on('data', (event: T) => events.push(event)); + parser.on('error', reject); + parser.on('end', resolve); + Readable.from([content]).pipe(parser); + }); + + const nonEmptyLineCount = content + .split('\n') + .filter((line) => line.trim().length > 0).length; + const discardedLineCount = nonEmptyLineCount - events.length; + const warnings = + discardedLineCount > 0 + ? [ + `${sourceName} discarded ${discardedLineCount} malformed JSONL line${ + discardedLineCount === 1 ? 
/**
 * Collects every `tool_use` content block from the audit events, in event and
 * block order.
 *
 * Names of the form `mcp__<server>__<tool>` are reduced to the trailing tool
 * segment; any other name is kept unchanged. Blocks without a name are
 * skipped, and a missing `input` becomes an empty arguments object.
 *
 * NOTE(review): the server group in the pattern is greedy, so for a name such
 * as `mcp__a__b__c` the reported tool is `c` — confirm this split is the one
 * wanted for tool names that themselves contain `__`.
 *
 * @param events - Parsed audit/transcript events.
 * @returns The tool calls found, possibly empty.
 */
function extractToolCalls(events: ClaudeAuditEvent[]): LLMToolCall[] {
  const mcpToolPattern = /^mcp__(.+)__(.+)$/;

  return events.flatMap((event) =>
    (event.message?.content ?? []).flatMap((block) => {
      if (block.type !== 'tool_use' || !block.name) {
        return [];
      }
      const toolName = mcpToolPattern.exec(block.name)?.[2] ?? block.name;
      return [
        {
          name: toolName,
          arguments: block.input ?? {},
          id: block.id,
        },
      ];
    })
  );
}
/**
 * Normalises a metadata timestamp to a string.
 *
 * Strings are returned unchanged (no validation is applied). Numbers are
 * interpreted as epoch milliseconds and rendered as an ISO-8601 string.
 *
 * @param value - Timestamp as read from session metadata.
 * @returns The timestamp string, or `undefined` when `value` is missing or is
 *   a number that does not map to a valid date (NaN, ±Infinity, or outside
 *   the range `Date` can represent).
 */
function metadataTimestampString(
  value: string | number | undefined
): string | undefined {
  if (typeof value === 'string') {
    return value;
  }
  if (typeof value === 'number') {
    const date = new Date(value);
    // toISOString() throws RangeError on an invalid Date; metadata comes from
    // parsed JSON, so treat an unrepresentable number as "no timestamp".
    return Number.isNaN(date.getTime()) ? undefined : date.toISOString();
  }
  return undefined;
}
/**
 * Creates a unique identifier for one external-host run.
 *
 * @param caseId - Identifier of the eval case being run; used as the prefix.
 * @returns `<caseId>-<uuid>`, where the suffix is a freshly generated random
 *   UUID.
 */
export function createExternalHostRunId(caseId: string): string {
  const uniqueSuffix = randomUUID();
  return `${caseId}-${uniqueSuffix}`;
}
+ expect(script).not.toContain('click at {'); + }); + + it('emits Cmd+N when createNewConversation is enabled', () => { + const script = buildMacosDesktopSubmitScript('hello marker', { + appName: 'Example', + createNewConversation: true, + settleDelayMs: 500, + }); + + expect(script).toContain('keystroke "n" using command down'); + }); + + it('verifies the target app is foregrounded before sending keystrokes and errors fast otherwise', () => { + const script = buildMacosDesktopSubmitScript('hello marker', { + appName: 'Example', + createNewConversation: false, + settleDelayMs: 500, + }); + + // The retry loop polls `frontmost` and re-asserts `set frontmost to true` + // up to 10 times so transient focus-prevention can be retried before we + // give up. + expect(script).toContain('repeat 10 times'); + expect(script).toContain('if frontmost then'); + expect(script).toContain('set frontmost to true'); + + // If the loop exits without activation succeeding, the script must error + // fast with a message identifying the foreground problem rather than + // letting downstream keystrokes route to the wrong app and surface as a + // 90-second eval timeout. 
+ expect(script).toContain('if not activated then'); + expect(script).toContain( + 'could not be brought to the foreground (focus is held by another app)' + ); + }); +}); diff --git a/src/evals/externalHost/builtins/macosDesktop.ts b/src/evals/externalHost/builtins/macosDesktop.ts new file mode 100644 index 0000000..e679fb9 --- /dev/null +++ b/src/evals/externalHost/builtins/macosDesktop.ts @@ -0,0 +1,358 @@ +import { execFile } from 'node:child_process'; +import { promisify } from 'node:util'; +import type { + ExternalHostCapabilityContext, + ExternalHostCapabilityImplementation, + ExternalHostFailureKind, + ExternalHostRunResult, +} from '../types.js'; +import { driverToSlug, hostTypeFromDriver } from '../driverIdentity.js'; + +const execFileAsync = promisify(execFile); +const DEFAULT_SETTLE_DELAY_MS = 500; +const DEFAULT_APPLESCRIPT_TIMEOUT_MS = 30_000; +const DEFAULT_APPLESCRIPT_MAX_BUFFER = 64 * 1024 * 1024; + +export const MACOS_DESKTOP_CAPABILITIES: ExternalHostCapabilityImplementation[] = + [ + { + id: 'builtin:platform.macos', + capabilities: ['control'], + run: requireMacosCapability, + }, + { + id: 'builtin:desktop.macos.accessibilitySubmit', + capabilities: ['control', 'input'], + run: submitPromptCapability, + }, + ]; + +export async function runAppleScript( + script: string, + options: { timeoutMs?: number; maxBuffer?: number } = {} +): Promise { + const result = await execFileAsync('osascript', ['-e', script], { + maxBuffer: options.maxBuffer ?? DEFAULT_APPLESCRIPT_MAX_BUFFER, + timeout: options.timeoutMs ?? 
/**
 * Replaces the macOS clipboard contents with `value` by piping it to
 * `pbcopy`.
 *
 * @param value - Text to place on the clipboard.
 * @returns A promise that resolves once `pbcopy` has exited successfully.
 *   It rejects with an `Error` carrying the underlying message when `pbcopy`
 *   fails or when writing to its stdin fails.
 */
export function writeMacosClipboard(value: string): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const child = execFile('pbcopy', (error) => {
      if (error) {
        reject(new Error(error.message));
        return;
      }
      resolve();
    });

    // A stream 'error' event with no listener is thrown as an uncaught
    // exception. If pbcopy cannot be spawned or exits before consuming its
    // input, the write below can fail, so route that failure into the promise.
    // Rejecting after the promise has already settled is a harmless no-op.
    child.stdin?.on('error', (stdinError: Error) => {
      reject(new Error(stdinError.message));
    });
    child.stdin?.end(value);
  });
}
/**
 * Submits `prompt` to a macOS desktop app through the clipboard and an
 * AppleScript keystroke script.
 *
 * The script is built first, then the prompt is written to the clipboard, and
 * finally the script is executed; it pastes and submits the clipboard
 * contents.
 *
 * @param prompt - Text to submit.
 * @param options - Target app and submission behaviour. `settleDelayMs`
 *   falls back to `DEFAULT_SETTLE_DELAY_MS` when omitted.
 * @returns Resolves when the AppleScript has finished; rejects if the
 *   clipboard write or the script fails.
 */
export async function submitPromptToMacosDesktopApp(
  prompt: string,
  options: {
    appName: string;
    createNewConversation: boolean;
    settleDelayMs?: number;
    submitButtonNames?: string[];
  }
): Promise<void> {
  const submitScript = buildMacosDesktopSubmitScript(prompt, {
    ...options,
    settleDelayMs: options.settleDelayMs ?? DEFAULT_SETTLE_DELAY_MS,
  });

  // The script pastes from the clipboard, so the prompt must be there first.
  await writeMacosClipboard(prompt);
  await runAppleScript(submitScript);
}
`keystroke "n" using command down + delay ${Math.max(settleDelayMs, 1500) / 1000}` + : ''; + + return ` +tell application ${JSON.stringify(options.appName)} to activate +delay ${settleDelayMs / 1000} + +-- Verify the app actually came to the foreground. tell-to-activate is +-- unreliable on multi-monitor / multi-Space setups when another app +-- (browser, terminal, etc.) holds focus-prevention precedence. Retry +-- bringing the app forward up to ~2 seconds; fail fast with a clear +-- error if the OS refuses, since otherwise our keystrokes route to +-- whatever app actually has focus and the eval times out 90s later. +set activated to false +repeat 10 times + tell application "System Events" to tell process ${JSON.stringify(options.appName)} + if frontmost then + set activated to true + exit repeat + end if + try + set frontmost to true + end try + end tell + delay 0.2 +end repeat +if not activated then + error ${JSON.stringify(options.appName)} & " could not be brought to the foreground (focus is held by another app); keystrokes would route to the wrong app" +end if + +tell application "System Events" + -- Force a known-focus state by opening a new conversation. Chromium's React + -- app autofocuses the composer on a fresh chat view, even though + -- AXFocusedUIElement doesn't expose that state to AppleScript. This avoids + -- coordinate-based clicks that are fragile to window position, monitor + -- placement, or layout drift. + ${newConversation} + -- Paste the prompt from clipboard. The caller has already written the + -- prompt to the macOS clipboard via writeMacosClipboard. The keystroke + -- routes to whatever has DOM focus inside the active window. + keystroke "v" using command down + delay 0.4 + -- Submit via Return. 
/**
 * Decides whether the submit script should open a new conversation first.
 *
 * @param option - The binding's `createNewConversation` value. `true` enables
 *   it; the string `'unless-disabled'` defers to the host config; any other
 *   value disables it.
 * @param config - Host config; with `'unless-disabled'`, a new conversation
 *   is created unless `options.newConversationShortcut` is exactly `'none'`.
 * @returns Whether a new conversation should be created.
 */
function shouldCreateNewConversation(
  option: unknown,
  config: { options?: Record<string, unknown> }
): boolean {
  if (option !== 'unless-disabled') {
    return option === true;
  }

  // Only the literal string 'none' disables the shortcut; a missing or
  // non-string value leaves it enabled.
  const shortcut = config.options?.['newConversationShortcut'];
  return shortcut !== 'none';
}
/**
 * Reads an option that is expected to be a list of strings.
 *
 * Non-string entries are dropped rather than rejected.
 *
 * @param options - Option bag, possibly undefined.
 * @param key - Name of the option to read.
 * @returns The string entries in their original order, or `undefined` when
 *   the option is missing, is not an array, or contains no strings.
 */
function stringArrayOption(
  options: Record<string, unknown> | undefined,
  key: string
): string[] | undefined {
  const raw = options?.[key];
  if (!Array.isArray(raw)) {
    return undefined;
  }

  const collected: string[] = [];
  for (const item of raw) {
    if (typeof item === 'string') {
      collected.push(item);
    }
  }

  return collected.length === 0 ? undefined : collected;
}
/**
 * Capabilities every external host configuration must provide, in the order
 * they are reported when missing.
 */
export const REQUIRED_HOST_CAPABILITIES: HostCapability[] = [
  'control',
  'input',
  'completion',
  'trace',
  'normalize',
];

/**
 * Reports which required capabilities are absent from `capabilities`.
 *
 * @param capabilities - Capabilities a host configuration provides;
 *   duplicates and ordering are irrelevant.
 * @returns The missing capabilities in `REQUIRED_HOST_CAPABILITIES` order;
 *   empty when every required capability is present.
 */
export function validateHostCapabilities(
  capabilities: readonly HostCapability[]
): HostCapability[] {
  const missing: HostCapability[] = [];
  for (const required of REQUIRED_HOST_CAPABILITIES) {
    if (!capabilities.includes(required)) {
      missing.push(required);
    }
  }
  return missing;
}
platform: 'macos', +} as const; + +const TEST_CORRELATION = { + strategy: 'prompt_marker', + marker: 'MCP_SERVER_TESTER_CAPABILITY', + includedInPrompt: true, +} as const; + +describe('external host capability runtime', () => { + it('composes a runner from config-declared capability bindings', async () => { + const calls: string[] = []; + + registerExternalHostCapability({ + id: 'test.capability.success', + capabilities: ['control', 'input', 'completion', 'trace', 'normalize'], + async setup({ state }) { + calls.push('setup'); + state.data.setupSeen = true; + }, + async run({ run, state }) { + calls.push('run'); + expect(state.driverSlug).toBe('test.host.chat.desktop-app.macos'); + expect(state.data.setupSeen).toBe(true); + return { + success: true, + response: 'composed result', + toolCalls: [], + externalHost: { + driver: state.driver, + driverSlug: state.driverSlug, + displayName: state.displayName, + hostName: state.displayName, + hostType: 'custom', + capabilitiesUsed: state.capabilitiesUsed, + traceSource: 'manual-import', + traceConfidence: 'high', + artifacts: [], + session: { runMarker: run.marker }, + correlation: run.correlation, + }, + }; + }, + }); + + const runner = await loadExternalHostRunner({ + driver: TEST_DRIVER, + capabilities: { + control: { + uses: 'test.capability.success', + provides: ['input', 'completion', 'trace', 'normalize'], + }, + }, + }); + + const result = await runner.run({ + runId: 'run', + caseId: 'case', + scenario: 'scenario', + submittedScenario: 'scenario', + marker: 'MCP_SERVER_TESTER_CAPABILITY', + correlation: TEST_CORRELATION, + timeoutMs: 1000, + startedAtMs: Date.now(), + }); + + expect(calls).toEqual(['setup', 'run']); + expect(result).toMatchObject({ + success: true, + response: 'composed result', + externalHost: { + driverSlug: 'test.host.chat.desktop-app.macos', + capabilitiesUsed: [ + 'control', + 'input', + 'completion', + 'trace', + 'normalize', + ], + }, + }); + }); + + it('treats binding provides as additional 
capabilities', async () => { + registerExternalHostCapability({ + id: 'test.capability.extraControl', + capabilities: ['control'], + }); + registerExternalHostCapability({ + id: 'test.capability.inputTrace', + capabilities: ['input', 'trace'], + }); + + const loaded = await loadExternalHostConfig({ + driver: TEST_DRIVER, + capabilities: { + control: { uses: 'test.capability.extraControl' }, + input: { + uses: 'test.capability.inputTrace', + provides: ['completion', 'normalize'], + }, + }, + }); + + expect(loaded.capabilitiesUsed).toEqual([ + 'control', + 'input', + 'trace', + 'completion', + 'normalize', + ]); + }); + + it('fails config loading when required capabilities are missing', async () => { + registerExternalHostCapability({ + id: 'test.capability.controlOnly', + capabilities: ['control'], + }); + + await expect( + loadExternalHostConfig({ + driver: TEST_DRIVER, + capabilities: { + control: { uses: 'test.capability.controlOnly' }, + }, + }) + ).rejects.toThrow('missing capabilities'); + }); + + it('fails config loading for unavailable capability implementations', async () => { + await expect( + loadExternalHostConfig({ + driver: TEST_DRIVER, + capabilities: { + control: { + uses: 'missing.capability', + provides: ['input', 'completion', 'trace', 'normalize'], + }, + }, + }) + ).rejects.toThrow('not available'); + }); +}); diff --git a/src/evals/externalHost/capabilityRuntime.ts b/src/evals/externalHost/capabilityRuntime.ts new file mode 100644 index 0000000..152044f --- /dev/null +++ b/src/evals/externalHost/capabilityRuntime.ts @@ -0,0 +1,336 @@ +import { + REQUIRED_HOST_CAPABILITIES, + validateHostCapabilities, +} from './capabilities.js'; +import { + getRegisteredExternalHostConfig, + getRegisteredExternalHostDisplayName, +} from './hostRegistry.js'; +import { + listBuiltinExternalHostCapabilities, + resolveBuiltinExternalHostCapability, +} from './builtinCapabilities.js'; +import { + driverToSlug, + hostTypeFromDriver, + normalizeHostDriver, +} from 
/**
 * Lists every known capability implementation, built-in and registered.
 *
 * Entries are de-duplicated by `id`. Built-ins are added first, so a
 * registered implementation with the same id replaces the built-in while
 * keeping the built-in's position in the list.
 *
 * @returns One implementation per id.
 */
export function listExternalHostCapabilities(): ExternalHostCapabilityImplementation[] {
  const byId = new Map<string, ExternalHostCapabilityImplementation>();

  for (const builtin of listBuiltinExternalHostCapabilities()) {
    byId.set(builtin.id, builtin);
  }
  for (const registered of CAPABILITIES.values()) {
    byId.set(registered.id, registered);
  }

  return [...byId.values()];
}
run(context: HostRunContext): Promise { + return runLoadedExternalHost(loaded, context); + }, + }; +} + +export async function loadExternalHostConfig( + config: ExternalHostConfig +): Promise { + const driver = normalizeHostDriver(config.driver); + const driverSlug = driverToSlug(driver); + const registeredConfig = getRegisteredExternalHostConfig(driverSlug); + const effectiveConfig = mergeExternalHostConfig(config, registeredConfig); + const capabilitiesConfig = effectiveConfig.capabilities; + + if (!capabilitiesConfig) { + throw new Error( + `External host ${driverSlug} does not declare capabilities and has no built-in defaults.` + ); + } + + const loadedCapabilities: LoadedExternalHostCapability[] = []; + const providedCapabilities = new Set(); + + for (const capability of REQUIRED_HOST_CAPABILITIES) { + const bindings = normalizeCapabilityBindings( + capabilitiesConfig[capability] + ); + for (const binding of bindings) { + const implementation = await resolveExternalHostCapability(binding.uses); + if (!implementation) { + throw new Error( + `External host capability implementation is not available: ${binding.uses}` + ); + } + + loadedCapabilities.push({ + capability, + binding, + implementation, + }); + providedCapabilities.add(capability); + for (const provided of [ + ...implementation.capabilities, + ...(binding.provides ?? []), + ]) { + providedCapabilities.add(provided); + } + } + } + + const capabilitiesUsed = Array.from(providedCapabilities); + const missingCapabilities = validateHostCapabilities(capabilitiesUsed); + if (missingCapabilities.length > 0) { + throw new Error( + `External host ${driverSlug} is missing capabilities: ${missingCapabilities.join(', ')}` + ); + } + + return { + config: effectiveConfig, + driver, + driverSlug, + displayName: + effectiveConfig.name ?? + getRegisteredExternalHostDisplayName(driverSlug) ?? 
/**
 * Executes a loaded external-host configuration for one run.
 *
 * All `setup` hooks run first, in binding order, followed by all `run` hooks.
 * The run stops at the first hook that returns a result or leaves one in
 * `state.result`. If no hook produces a result, a runtime failure result is
 * returned instead.
 *
 * @param loaded - Resolved configuration and capability implementations.
 * @param context - Per-run context handed to every hook.
 * @returns The first result produced, or the fallback failure result.
 */
async function runLoadedExternalHost(
  loaded: LoadedExternalHostConfig,
  context: HostRunContext
): Promise<ExternalHostRunResult> {
  // Shared, mutable state that every hook of this run can read and write.
  const state: ExternalHostRunState = {
    driver: loaded.driver,
    driverSlug: loaded.driverSlug,
    displayName: loaded.displayName,
    capabilitiesUsed: loaded.capabilitiesUsed,
    data: {},
  };

  const phases = ['setup', 'run'] as const;
  for (const phase of phases) {
    for (const entry of loaded.loadedCapabilities) {
      const hookResult = await entry.implementation[phase]?.(
        capabilityContext(loaded, context, state, entry)
      );
      if (hookResult) {
        return hookResult;
      }
      if (state.result) {
        return state.result;
      }
    }
  }

  return runtimeFailure(
    loaded,
    context,
    `External host ${loaded.driverSlug} completed without producing a result.`
  );
}
/**
 * Runtime guard for values loaded from `module:` capability ids.
 *
 * A value qualifies when it is a non-null object with a string `id`, a
 * `capabilities` array containing only strings, and `setup` / `run` hooks
 * that are either absent or functions. Checking the hooks here makes a
 * malformed module export fail at load time with the caller's "did not
 * export a valid implementation" error instead of a TypeError when the hook
 * is invoked.
 *
 * @param value - Candidate export.
 * @returns `true` when `value` has the shape of a capability implementation.
 */
function isExternalHostCapabilityImplementation(
  value: unknown
): value is ExternalHostCapabilityImplementation {
  if (typeof value !== 'object' || value === null) {
    return false;
  }

  const candidate = value as {
    id?: unknown;
    capabilities?: unknown;
    setup?: unknown;
    run?: unknown;
  };
  // Hooks are invoked with optional-call syntax, so null/undefined are fine.
  const isOptionalHook = (hook: unknown): boolean =>
    hook === undefined || hook === null || typeof hook === 'function';

  return (
    typeof candidate.id === 'string' &&
    Array.isArray(candidate.capabilities) &&
    candidate.capabilities.every((entry) => typeof entry === 'string') &&
    isOptionalHook(candidate.setup) &&
    isOptionalHook(candidate.run)
  );
}
hostTypeFromDriver(loaded.driver), + hostVariant: loaded.config.variant, + capabilitiesUsed: loaded.capabilitiesUsed, + traceSource: 'none', + traceConfidence: 'unknown', + traceLimitations: [ + 'The external host capability runner did not produce a result.', + ], + artifacts: [], + session: { runMarker: context.marker }, + correlation: context.correlation, + failureKind: 'unsupported_host', + }, + }; +} diff --git a/src/evals/externalHost/driverIdentity.ts b/src/evals/externalHost/driverIdentity.ts new file mode 100644 index 0000000..4ca7d6f --- /dev/null +++ b/src/evals/externalHost/driverIdentity.ts @@ -0,0 +1,77 @@ +import type { + ExternalHostType, + HostDriverConfig, + HostDriverId, +} from './types.js'; + +export const CLAUDE_CHAT_DESKTOP_MACOS_DRIVER: HostDriverId = { + provider: 'anthropic', + product: 'claude', + surface: 'chat', + runtime: 'desktop-app', + platform: 'macos', +}; + +export const CLAUDE_COWORK_DESKTOP_MACOS_DRIVER: HostDriverId = { + provider: 'anthropic', + product: 'claude', + surface: 'cowork', + runtime: 'desktop-app', + platform: 'macos', +}; + +export const CLAUDE_CODE_CLI_MACOS_DRIVER: HostDriverId = { + provider: 'anthropic', + product: 'claude', + surface: 'code', + runtime: 'cli', + platform: 'macos', +}; + +export function driverToSlug(driver: HostDriverId): string { + return [ + driver.provider, + driver.product, + driver.surface, + driver.runtime, + driver.platform, + driver.channel, + ] + .filter((part): part is string => Boolean(part)) + .join('.'); +} + +export function parseDriverSlug(slug: string): HostDriverId { + const [provider, product, surface, runtime, platform, ...rest] = + slug.split('.'); + + if (!provider || !product || !surface || !runtime) { + throw new Error( + `External host driver slug must include provider.product.surface.runtime: ${slug}` + ); + } + + return { + provider, + product, + surface, + runtime, + ...(platform ? { platform } : {}), + ...(rest.length > 0 ? 
{ channel: rest.join('.') } : {}), + }; +} + +export function normalizeHostDriver(driver: HostDriverConfig): HostDriverId { + if (typeof driver === 'string') { + return parseDriverSlug(driver); + } + + return driver; +} + +export function hostTypeFromDriver(driver: HostDriverId): ExternalHostType { + if (driver.runtime === 'cli' || driver.runtime === 'tui') return 'cli'; + if (driver.runtime === 'browser') return 'browser'; + if (driver.runtime === 'desktop-app') return 'desktop'; + return 'custom'; +} diff --git a/src/evals/externalHost/hostRegistry.test.ts b/src/evals/externalHost/hostRegistry.test.ts new file mode 100644 index 0000000..4550f9d --- /dev/null +++ b/src/evals/externalHost/hostRegistry.test.ts @@ -0,0 +1,91 @@ +import { describe, expect, it } from 'vitest'; +import { + CLAUDE_COWORK_DESKTOP_MACOS_DRIVER, + driverToSlug, + getRegisteredExternalHostConfig, + loadExternalHostConfig, + listRegisteredExternalHostSlugs, + normalizeHostDriver, + parseDriverSlug, +} from './index.js'; + +describe('external host driver identity and built-in defaults', () => { + it('round-trips structured driver ids to slugs', () => { + const slug = driverToSlug(CLAUDE_COWORK_DESKTOP_MACOS_DRIVER); + + expect(slug).toBe('anthropic.claude.cowork.desktop-app.macos'); + expect(parseDriverSlug(slug)).toEqual(CLAUDE_COWORK_DESKTOP_MACOS_DRIVER); + }); + + it('normalizes driver slug strings to structured ids', () => { + expect( + normalizeHostDriver('anthropic.claude.cowork.desktop-app.macos') + ).toEqual(CLAUDE_COWORK_DESKTOP_MACOS_DRIVER); + }); + + it('declares Claude Cowork as capability bindings, not a concrete runner', () => { + const config = getRegisteredExternalHostConfig( + 'anthropic.claude.cowork.desktop-app.macos' + ); + + expect(config?.name).toBe('Claude Cowork Desktop'); + expect(config?.correlation).toEqual({ + strategy: 'prompt_marker', + includeInPrompt: true, + }); + expect(config?.capabilities).toMatchObject({ + control: [ + { uses: 'builtin:platform.macos' }, 
+ { + uses: 'builtin:anthropic.claude.activateCoworkSurface', + with: { appName: 'Claude' }, + }, + ], + input: { uses: 'builtin:desktop.macos.accessibilitySubmit' }, + completion: { + uses: 'builtin:anthropic.claude.localAgentTrace', + provides: ['trace'], + }, + normalize: { + uses: 'builtin:anthropic.claude.localAgentNormalize', + }, + }); + }); + + it('loads Claude Cowork defaults into concrete capability providers at runtime', async () => { + const loaded = await loadExternalHostConfig({ + driver: 'anthropic.claude.cowork.desktop-app.macos', + }); + + expect(loaded.displayName).toBe('Claude Cowork Desktop'); + expect(loaded.capabilitiesUsed).toEqual([ + 'control', + 'input', + 'completion', + 'trace', + 'normalize', + ]); + expect( + loaded.loadedCapabilities.map((capability) => capability.binding.uses) + ).toEqual([ + 'builtin:platform.macos', + 'builtin:anthropic.claude.activateCoworkSurface', + 'builtin:desktop.macos.accessibilitySubmit', + 'builtin:anthropic.claude.localAgentTrace', + 'builtin:anthropic.claude.localAgentNormalize', + ]); + }); + + it('returns no built-in defaults for syntactically valid unsupported drivers', () => { + expect( + getRegisteredExternalHostConfig('openai.chatgpt.chat.browser.web') + ).toBeUndefined(); + }); + + it('lists registered external hosts by structured driver slug', () => { + expect(listRegisteredExternalHostSlugs()).toEqual([ + 'anthropic.claude.chat.desktop-app.macos', + 'anthropic.claude.cowork.desktop-app.macos', + ]); + }); +}); diff --git a/src/evals/externalHost/hostRegistry.ts b/src/evals/externalHost/hostRegistry.ts new file mode 100644 index 0000000..2f4b790 --- /dev/null +++ b/src/evals/externalHost/hostRegistry.ts @@ -0,0 +1,88 @@ +import type { ExternalHostConfig } from './types.js'; +import { + CLAUDE_CHAT_DESKTOP_MACOS_DRIVER, + CLAUDE_COWORK_DESKTOP_MACOS_DRIVER, + driverToSlug, +} from './driverIdentity.js'; + +const EXTERNAL_HOST_REGISTRY: Record< + string, + Partial & { name: string; description: 
string } +> = { + [driverToSlug(CLAUDE_CHAT_DESKTOP_MACOS_DRIVER)]: { + driver: CLAUDE_CHAT_DESKTOP_MACOS_DRIVER, + name: 'Claude Chat Desktop', + description: + 'Drives the regular Claude Desktop chat surface on macOS and captures low-confidence visible response evidence via Accessibility.', + correlation: { + strategy: 'prompt_marker', + includeInPrompt: true, + }, + capabilities: { + control: { uses: 'builtin:platform.macos' }, + input: { + uses: 'builtin:desktop.macos.accessibilitySubmit', + with: { + appName: 'Claude', + createNewConversation: 'unless-disabled', + }, + }, + completion: { + uses: 'builtin:anthropic.claude.accessibilityTrace', + provides: ['trace', 'normalize'], + }, + }, + }, + [driverToSlug(CLAUDE_COWORK_DESKTOP_MACOS_DRIVER)]: { + driver: CLAUDE_COWORK_DESKTOP_MACOS_DRIVER, + name: 'Claude Cowork Desktop', + description: + 'Drives the Claude Desktop Cowork surface on macOS and captures high-confidence local-agent trace evidence.', + correlation: { + strategy: 'prompt_marker', + includeInPrompt: true, + }, + capabilities: { + control: [ + { uses: 'builtin:platform.macos' }, + { + uses: 'builtin:anthropic.claude.activateCoworkSurface', + with: { appName: 'Claude' }, + }, + ], + input: { + uses: 'builtin:desktop.macos.accessibilitySubmit', + with: { appName: 'Claude', createNewConversation: true }, + }, + completion: { + uses: 'builtin:anthropic.claude.localAgentTrace', + provides: ['trace'], + }, + normalize: { + uses: 'builtin:anthropic.claude.localAgentNormalize', + }, + }, + }, +}; + +export function getRegisteredExternalHostConfig( + driverSlug: string +): Partial | undefined { + return EXTERNAL_HOST_REGISTRY[driverSlug]; +} + +export function getRegisteredExternalHostDisplayName( + driverSlug: string +): string | undefined { + return EXTERNAL_HOST_REGISTRY[driverSlug]?.name; +} + +export function getRegisteredExternalHostDescription( + driverSlug: string +): string | undefined { + return EXTERNAL_HOST_REGISTRY[driverSlug]?.description; +} 
+ +export function listRegisteredExternalHostSlugs(): string[] { + return Object.keys(EXTERNAL_HOST_REGISTRY); +} diff --git a/src/evals/externalHost/index.ts b/src/evals/externalHost/index.ts new file mode 100644 index 0000000..31d3e1a --- /dev/null +++ b/src/evals/externalHost/index.ts @@ -0,0 +1,69 @@ +export { runExternalHostScenario } from './runtime.js'; +export { + REQUIRED_HOST_CAPABILITIES, + validateHostCapabilities, +} from './capabilities.js'; +export { + getRegisteredExternalHostConfig, + getRegisteredExternalHostDescription, + getRegisteredExternalHostDisplayName, + listRegisteredExternalHostSlugs, +} from './hostRegistry.js'; +export { + listBuiltinExternalHostCapabilities, + resolveBuiltinExternalHostCapability, +} from './builtinCapabilities.js'; +export { + listExternalHostCapabilities, + loadExternalHostConfig, + loadExternalHostRunner, + registerExternalHostCapability, + resolveExternalHostCapability, +} from './capabilityRuntime.js'; +export type { + LoadedExternalHostCapability, + LoadedExternalHostConfig, +} from './capabilityRuntime.js'; +export { + CLAUDE_CHAT_DESKTOP_MACOS_DRIVER, + CLAUDE_CODE_CLI_MACOS_DRIVER, + CLAUDE_COWORK_DESKTOP_MACOS_DRIVER, + driverToSlug, + hostTypeFromDriver, + normalizeHostDriver, + parseDriverSlug, +} from './driverIdentity.js'; +export { + ExternalHostCapabilityBindingSchema, + ExternalHostConfigSchema, + ExternalHostCorrelationSchema, + getExternalHostConfigJsonSchema, + getExternalHostReference, + HostCapabilitySchema, + HostDriverIdSchema, + listExternalHostDriverReferences, +} from './schema.js'; +export type { ExternalHostDriverReference } from './schema.js'; +export type { + EvidenceSource, + ExternalHostCapabilityBinding, + ExternalHostCapabilityContext, + ExternalHostCapabilityImplementation, + ExternalHostCapabilitiesConfig, + ExternalHostConfig, + ExternalHostFailureKind, + ExternalHostMetadata, + ExternalHostRunState, + ExternalHostRunResult, + ExternalHostRunner, + ExternalHostSession, + 
ExternalHostSimulationResult, + ExternalHostType, + HostArtifact, + HostCapability, + HostDriverConfig, + HostDriverId, + HostRunContext, + ObservationConfidence, + TraceSource, +} from './types.js'; diff --git a/src/evals/externalHost/runtime.test.ts b/src/evals/externalHost/runtime.test.ts new file mode 100644 index 0000000..c936463 --- /dev/null +++ b/src/evals/externalHost/runtime.test.ts @@ -0,0 +1,56 @@ +import { describe, expect, it } from 'vitest'; +import { formatSubmittedScenario, runExternalHostScenario } from './runtime.js'; + +describe('external host runtime', () => { + it('adds an evaluator marker with an instruction not to mention it', () => { + const submitted = formatSubmittedScenario( + 'Reply with exactly: acknowledged.', + 'MCP_SERVER_TESTER_run_123' + ); + + expect(submitted).toContain('Reply with exactly: acknowledged.'); + expect(submitted).toContain('[eval-run-marker:MCP_SERVER_TESTER_run_123]'); + expect(submitted).toContain('do not mention this marker'); + }); + + it('leaves the submitted scenario unchanged when prompt correlation is disabled', () => { + const submitted = formatSubmittedScenario( + 'Reply with exactly: acknowledged.', + 'MCP_SERVER_TESTER_run_123', + { strategy: 'none' } + ); + + expect(submitted).toBe('Reply with exactly: acknowledged.'); + }); + + it('supports prompt marker correlation without including it in the prompt', () => { + const submitted = formatSubmittedScenario( + 'Reply with exactly: acknowledged.', + 'MCP_SERVER_TESTER_run_123', + { strategy: 'prompt_marker', includeInPrompt: false } + ); + + expect(submitted).toBe('Reply with exactly: acknowledged.'); + }); + + it('infers host type for unsupported driver failures', async () => { + const result = await runExternalHostScenario( + 'hello', + { driver: 'openai.chatgpt.chat.browser.web' }, + { runId: 'unsupported-browser' } + ); + + expect(result).toMatchObject({ + success: false, + externalHost: { + driverSlug: 'openai.chatgpt.chat.browser.web', + hostType: 
'browser', + failureKind: 'unsupported_host', + correlation: { + strategy: 'none', + includedInPrompt: false, + }, + }, + }); + }); +}); diff --git a/src/evals/externalHost/runtime.ts b/src/evals/externalHost/runtime.ts new file mode 100644 index 0000000..2b9ace7 --- /dev/null +++ b/src/evals/externalHost/runtime.ts @@ -0,0 +1,139 @@ +import { randomUUID } from 'node:crypto'; +import type { + ExternalHostCorrelationConfig, + ExternalHostCorrelationMetadata, + ExternalHostConfig, + ExternalHostRunResult, + HostRunContext, +} from './types.js'; +import { + driverToSlug, + hostTypeFromDriver, + normalizeHostDriver, +} from './driverIdentity.js'; +import { + createExternalHostRunner, + loadExternalHostConfig, +} from './capabilityRuntime.js'; + +const DEFAULT_TIMEOUT_MS = 120_000; +const DEFAULT_PROMPT_MARKER_TEMPLATE = + 'Trace marker for MCP Server Tester; do not mention this marker in your response: [eval-run-marker:{{marker}}]'; + +export function formatSubmittedScenario( + scenario: string, + marker: string, + correlation: ExternalHostCorrelationConfig = { + strategy: 'prompt_marker', + includeInPrompt: true, + } +): string { + const metadata = normalizeCorrelation(correlation, marker); + if (!metadata.includedInPrompt) { + return scenario; + } + + const template = correlation.promptTemplate ?? DEFAULT_PROMPT_MARKER_TEMPLATE; + return `${scenario}\n\n${template.replaceAll('{{marker}}', marker)}`; +} + +export async function runExternalHostScenario( + scenario: string, + config: ExternalHostConfig, + options: { caseId?: string; runId?: string } = {} +): Promise { + const runId = options.runId ?? `external-host-${randomUUID()}`; + const marker = `MCP_SERVER_TESTER_${runId}`; + + let loaded; + try { + loaded = await loadExternalHostConfig(config); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + return unsupportedHostResult(config, marker, message); + } + + const timeoutMs = loaded.config.timeoutMs ?? 
DEFAULT_TIMEOUT_MS; + const correlation = normalizeCorrelation(loaded.config.correlation, marker); + const submittedScenario = formatSubmittedScenario( + scenario, + marker, + loaded.config.correlation + ); + + const context: HostRunContext = { + runId, + caseId: options.caseId ?? 'unknown', + scenario, + submittedScenario, + marker, + correlation, + timeoutMs, + startedAtMs: Date.now(), + }; + + const runner = createExternalHostRunner(loaded); + + return runner.run(context); +} + +function normalizeCorrelation( + correlation: ExternalHostCorrelationConfig | undefined, + marker: string +): ExternalHostCorrelationMetadata { + const strategy = correlation?.strategy ?? 'none'; + const includedInPrompt = + strategy === 'prompt_marker' + ? (correlation?.includeInPrompt ?? true) + : false; + + return { + strategy, + marker, + includedInPrompt, + }; +} + +function unsupportedHostResult( + config: ExternalHostConfig, + marker: string, + error: string +): ExternalHostRunResult { + const driver = (() => { + try { + return normalizeHostDriver(config.driver); + } catch { + return { + provider: 'unknown', + product: 'unknown', + surface: 'unknown', + runtime: 'unknown', + }; + } + })(); + const driverSlug = driverToSlug(driver); + + return { + success: false as const, + toolCalls: [], + error, + externalHost: { + driver, + driverSlug, + displayName: config.name ?? driverSlug, + hostName: config.name ?? driverSlug, + hostType: config.hostType ?? 
hostTypeFromDriver(driver), + hostVariant: config.variant, + capabilitiesUsed: [], + traceSource: 'none', + traceConfidence: 'unknown', + traceLimitations: [ + 'The external host capability configuration could not be loaded.', + ], + artifacts: [], + session: { runMarker: marker }, + correlation: normalizeCorrelation(config.correlation, marker), + failureKind: 'unsupported_host', + }, + }; +} diff --git a/src/evals/externalHost/schema.test.ts b/src/evals/externalHost/schema.test.ts new file mode 100644 index 0000000..25431f0 --- /dev/null +++ b/src/evals/externalHost/schema.test.ts @@ -0,0 +1,68 @@ +import { describe, expect, it } from 'vitest'; +import { + ExternalHostConfigSchema, + getExternalHostConfigJsonSchema, + getExternalHostReference, + listExternalHostDriverReferences, +} from './schema.js'; + +describe('external host schema and reference', () => { + it('validates minimal built-in external host config', () => { + const parsed = ExternalHostConfigSchema.parse({ + driver: 'anthropic.claude.cowork.desktop-app.macos', + timeoutMs: 60_000, + }); + + expect(parsed).toEqual({ + driver: 'anthropic.claude.cowork.desktop-app.macos', + timeoutMs: 60_000, + }); + }); + + it('exposes known driver slugs in the JSON schema for editor autocomplete', () => { + const schema = getExternalHostConfigJsonSchema(); + const driver = (schema.properties as Record) + .driver as Record; + const choices = driver.anyOf as Array>; + + expect(choices[0]).toMatchObject({ + type: 'string', + enum: [ + 'anthropic.claude.chat.desktop-app.macos', + 'anthropic.claude.cowork.desktop-app.macos', + ], + }); + }); + + it('lists built-in drivers with examples and internal capability defaults', () => { + const references = listExternalHostDriverReferences(); + const cowork = references.find( + (reference) => + reference.slug === 'anthropic.claude.cowork.desktop-app.macos' + ); + + expect(cowork).toMatchObject({ + name: 'Claude Cowork Desktop', + builtIn: true, + example: { + mode: 'external_host', 
+ externalHost: { + driver: 'anthropic.claude.cowork.desktop-app.macos', + }, + }, + }); + expect(cowork?.capabilities?.input).toMatchObject({ + uses: 'builtin:desktop.macos.accessibilitySubmit', + with: { appName: 'Claude' }, + }); + }); + + it('bundles schema and driver references for agents and docs generators', () => { + const reference = getExternalHostReference(); + + expect(reference).toMatchObject({ + schema: { title: 'MCP Server Tester ExternalHostConfig' }, + drivers: expect.any(Array), + }); + }); +}); diff --git a/src/evals/externalHost/schema.ts b/src/evals/externalHost/schema.ts new file mode 100644 index 0000000..9c13e27 --- /dev/null +++ b/src/evals/externalHost/schema.ts @@ -0,0 +1,281 @@ +import { z } from 'zod'; +import { + getRegisteredExternalHostConfig, + getRegisteredExternalHostDescription, + listRegisteredExternalHostSlugs, +} from './hostRegistry.js'; +import { driverToSlug, normalizeHostDriver } from './driverIdentity.js'; +import type { + ExternalHostCapabilitiesConfig, + ExternalHostConfig, + HostDriverId, +} from './types.js'; + +export const HostDriverIdSchema = z.object({ + provider: z.string().min(1), + product: z.string().min(1), + surface: z.string().min(1), + runtime: z.string().min(1), + platform: z.string().optional(), + channel: z.string().optional(), +}); + +export const HostCapabilitySchema = z.enum([ + 'control', + 'input', + 'completion', + 'trace', + 'normalize', +]); + +export const ExternalHostCapabilityBindingSchema = z.object({ + uses: z.string().min(1), + with: z.record(z.string(), z.unknown()).optional(), + provides: z.array(HostCapabilitySchema).optional(), +}); + +export const ExternalHostCorrelationSchema = z.object({ + strategy: z + .enum(['prompt_marker', 'host_session_metadata', 'none']) + .optional(), + includeInPrompt: z.boolean().optional(), + promptTemplate: z.string().optional(), +}); + +export const ExternalHostConfigSchema = z.object({ + driver: z.union([HostDriverIdSchema, z.string().min(1)]), + name: 
z.string().optional(), + hostType: z.enum(['cli', 'browser', 'desktop', 'custom']).optional(), + variant: z.string().optional(), + timeoutMs: z.number().int().positive().optional(), + capabilities: z + .partialRecord( + HostCapabilitySchema, + z.union([ + ExternalHostCapabilityBindingSchema, + z.array(ExternalHostCapabilityBindingSchema), + ]) + ) + .optional(), + correlation: ExternalHostCorrelationSchema.optional(), + options: z.record(z.string(), z.unknown()).optional(), +}); + +export interface ExternalHostDriverReference { + slug: string; + driver: HostDriverId; + name: string; + description?: string; + builtIn: true; + defaultConfig: ExternalHostConfig; + capabilities?: ExternalHostCapabilitiesConfig; + example: { + mode: 'external_host'; + scenario: string; + externalHost: Pick; + expect: { containsText: string }; + }; +} + +export function listExternalHostDriverReferences(): ExternalHostDriverReference[] { + return listRegisteredExternalHostSlugs().map((slug) => { + const config = getRegisteredExternalHostConfig(slug); + const driver = normalizeHostDriver(slug); + const name = config?.name ?? slug; + + return { + slug, + driver, + name, + description: getRegisteredExternalHostDescription(slug), + builtIn: true, + defaultConfig: { + driver, + ...(config ?? {}), + }, + capabilities: config?.capabilities, + example: { + mode: 'external_host', + scenario: 'Ask the host to complete the task you want to evaluate.', + externalHost: { + driver: slug, + timeoutMs: config?.timeoutMs ?? 
60_000, + }, + expect: { + containsText: 'expected text', + }, + }, + }; + }); +} + +export function getExternalHostConfigJsonSchema(): Record { + const driverSlugs = listRegisteredExternalHostSlugs(); + + return { + $schema: 'https://json-schema.org/draft/2020-12/schema', + $id: 'https://github.com/gleanwork/mcp-server-tester/schemas/external-host-config.schema.json', + title: 'MCP Server Tester ExternalHostConfig', + description: + 'Configuration for running an MCP eval through an external host driver.', + type: 'object', + additionalProperties: false, + required: ['driver'], + properties: { + driver: { + description: + 'Canonical built-in driver slug, custom driver slug, or structured driver identity.', + anyOf: [ + { + type: 'string', + enum: driverSlugs, + description: + 'Known built-in driver slug. Prefer this form for normal eval datasets.', + }, + { + type: 'string', + minLength: 1, + description: + 'Custom driver slug. Use when registering project-local capabilities.', + }, + hostDriverIdJsonSchema(), + ], + }, + name: { + type: 'string', + description: 'Optional display name shown in reports.', + }, + hostType: { + type: 'string', + enum: ['cli', 'browser', 'desktop', 'custom'], + description: 'Host category shown in reports.', + }, + variant: { + type: 'string', + description: 'Optional variant label for matrix-style runs.', + }, + timeoutMs: { + type: 'integer', + minimum: 1, + description: 'End-to-end timeout for the host run in milliseconds.', + }, + correlation: externalHostCorrelationJsonSchema(), + options: { + type: 'object', + additionalProperties: true, + description: + 'Driver-wide options interpreted by the selected driver or capability bindings.', + }, + capabilities: externalHostCapabilitiesJsonSchema(), + }, + examples: listExternalHostDriverReferences().map((reference) => ({ + driver: reference.slug, + timeoutMs: reference.example.externalHost.timeoutMs, + })), + }; +} + +export function getExternalHostReference(): Record { + return { + 
schema: getExternalHostConfigJsonSchema(), + drivers: listExternalHostDriverReferences(), + }; +} + +function hostDriverIdJsonSchema(): Record { + return { + type: 'object', + additionalProperties: false, + required: ['provider', 'product', 'surface', 'runtime'], + properties: { + provider: { type: 'string', minLength: 1 }, + product: { type: 'string', minLength: 1 }, + surface: { type: 'string', minLength: 1 }, + runtime: { type: 'string', minLength: 1 }, + platform: { type: 'string' }, + channel: { type: 'string' }, + }, + }; +} + +function externalHostCorrelationJsonSchema(): Record { + return { + type: 'object', + additionalProperties: false, + description: + 'How a submitted host run is correlated with host-native trace evidence.', + properties: { + strategy: { + type: 'string', + enum: ['prompt_marker', 'host_session_metadata', 'none'], + }, + includeInPrompt: { + type: 'boolean', + description: + 'Whether to include the generated run marker in the host-visible prompt.', + }, + promptTemplate: { + type: 'string', + description: 'Prompt suffix template. Supports {{marker}}.', + }, + }, + }; +} + +function externalHostCapabilitiesJsonSchema(): Record { + return { + type: 'object', + additionalProperties: false, + description: + 'Advanced escape hatch for overriding the capability recipe. Most users should choose a built-in driver instead.', + properties: Object.fromEntries( + HostCapabilitySchema.options.map((capability) => [ + capability, + { + anyOf: [ + externalHostCapabilityBindingJsonSchema(), + { + type: 'array', + items: externalHostCapabilityBindingJsonSchema(), + }, + ], + }, + ]) + ), + }; +} + +function externalHostCapabilityBindingJsonSchema(): Record { + return { + type: 'object', + additionalProperties: false, + required: ['uses'], + properties: { + uses: { + type: 'string', + minLength: 1, + description: + 'Capability implementation id. 
Built-ins use builtin:; custom integrations may use module:#.', + }, + with: { + type: 'object', + additionalProperties: true, + description: + 'Binding-local options interpreted by the selected capability implementation.', + }, + provides: { + type: 'array', + items: { + type: 'string', + enum: HostCapabilitySchema.options, + }, + }, + }, + }; +} + +export function externalHostDriverSlugForConfig( + config: ExternalHostConfig +): string { + return driverToSlug(normalizeHostDriver(config.driver)); +} diff --git a/src/evals/externalHost/types.ts b/src/evals/externalHost/types.ts new file mode 100644 index 0000000..650cc38 --- /dev/null +++ b/src/evals/externalHost/types.ts @@ -0,0 +1,274 @@ +import type { + LLMToolCall, + MCPHostSimulationResult, +} from '../mcpHost/mcpHostTypes.js'; +import type { UsageMetrics } from '../../types/index.js'; + +export type ExternalHostType = 'cli' | 'browser' | 'desktop' | 'custom'; + +export type HostCapability = + | 'control' + | 'input' + | 'completion' + | 'trace' + | 'normalize'; + +export type TraceSource = + | 'mcp-proxy' + | 'mcp-server-logs' + | 'host-local-transcript' + | 'host-native-export' + | 'browser-api' + | 'accessibility' + | 'dom' + | 'screenshot' + | 'stdout' + | 'manual-import' + | 'none'; + +export type ObservationConfidence = 'high' | 'medium' | 'low' | 'unknown'; + +export type ExternalHostCorrelationStrategy = + | 'prompt_marker' + | 'host_session_metadata' + | 'none'; + +export interface HostDriverId { + provider: string; + product: string; + surface: string; + runtime: string; + platform?: string; + channel?: string; +} + +export type HostDriverConfig = HostDriverId | string; + +export type ExternalHostFailureKind = + | 'app_unavailable' + | 'automation_permission_denied' + | 'submission_failed' + | 'no_matching_session' + | 'ambiguous_matching_sessions' + | 'timeout' + | 'parse_failure' + | 'host_run_failed' + | 'unsupported_host' + | 'unknown'; + +export interface HostArtifact { + kind: + | 'stdout' + | 
'stderr' + | 'log' + | 'transcript' + | 'audit' + | 'metadata' + | 'screenshot' + | 'video' + | 'har' + | 'trace'; + name: string; + path?: string; + contentType?: string; + summary?: string; +} + +export interface ExternalHostSession { + id?: string; + runMarker: string; + requestId?: string; + cliSessionId?: string; + cwd?: string; + startedAt?: string; + completedAt?: string; +} + +export interface ExternalHostCorrelationConfig { + /** + * How this run should be correlated with host-native evidence. + * + * - prompt_marker: append a marker to the submitted prompt. + * - host_session_metadata: rely on host-native session metadata. + * - none: no host-visible marker is submitted. + */ + strategy?: ExternalHostCorrelationStrategy; + /** + * Whether the marker should be included in the host-visible prompt. + * Defaults to true only for prompt_marker. + */ + includeInPrompt?: boolean; + /** + * Optional prompt suffix template. Supports {{marker}}. + */ + promptTemplate?: string; +} + +export interface ExternalHostCorrelationMetadata { + strategy: ExternalHostCorrelationStrategy; + marker: string; + includedInPrompt: boolean; +} + +export interface ExternalHostMetadata { + driver: HostDriverId; + driverSlug: string; + displayName: string; + hostName: string; + hostType: ExternalHostType; + hostVariant?: string; + capabilitiesUsed: HostCapability[]; + traceSource: TraceSource; + traceConfidence: ObservationConfidence; + traceLimitations?: string[]; + artifacts: HostArtifact[]; + session: ExternalHostSession; + correlation: ExternalHostCorrelationMetadata; + failureKind?: ExternalHostFailureKind; + sources?: { + finalAnswer?: TraceSource; + toolCalls?: TraceSource; + usage?: TraceSource; + cost?: TraceSource; + }; + evidence?: { + finalAnswer?: EvidenceSource; + toolCalls?: EvidenceSource; + usage?: EvidenceSource; + cost?: EvidenceSource; + }; +} + +export interface ExternalHostConfig { + /** + * Canonical structured driver identity or derived slug. 
+ * Example: `anthropic.claude.cowork.desktop-app.macos`. + */ + driver: HostDriverConfig; + /** + * Human-readable host name shown in reports. + */ + name?: string; + /** + * Host type shown in reports. + */ + hostType?: ExternalHostType; + /** + * Optional variant label for matrix-style runs. + */ + variant?: string; + /** + * End-to-end timeout for the host run. + */ + timeoutMs?: number; + /** + * Capability bindings used to compose this external host runner. + * If omitted, the runtime may provide a built-in default for known drivers. + */ + capabilities?: ExternalHostCapabilitiesConfig; + /** + * Run correlation strategy. Built-in drivers may provide defaults. + */ + correlation?: ExternalHostCorrelationConfig; + /** + * Driver-wide options available to capability implementations. + */ + options?: Record; +} + +export interface HostRunContext { + runId: string; + caseId: string; + scenario: string; + submittedScenario: string; + marker: string; + correlation: ExternalHostCorrelationMetadata; + timeoutMs: number; + startedAtMs: number; +} + +export interface ExternalHostSimulationResult extends MCPHostSimulationResult { + externalHost: ExternalHostMetadata; +} + +export interface ExternalHostRunSuccess { + success: true; + response?: string; + toolCalls: LLMToolCall[]; + conversationHistory?: MCPHostSimulationResult['conversationHistory']; + usage?: UsageMetrics; + llmDurationMs?: number; + mcpDurationMs?: number; + externalHost: ExternalHostMetadata; +} + +export interface ExternalHostRunFailure { + success: false; + error: string; + toolCalls: LLMToolCall[]; + externalHost: ExternalHostMetadata; +} + +export type ExternalHostRunResult = + | ExternalHostRunSuccess + | ExternalHostRunFailure; + +export type ExternalHostCapabilitiesConfig = Partial< + Record< + HostCapability, + ExternalHostCapabilityBinding | ExternalHostCapabilityBinding[] + > +>; + +export interface ExternalHostCapabilityBinding { + /** + * Implementation identifier. 
Built-ins use `builtin:`; callers may use + * `module:#` to load project-local integrations. + */ + uses: string; + /** + * Binding-local options interpreted only by the selected implementation. + */ + with?: Record; + /** + * Extra capabilities this binding should satisfy beyond its map key. + */ + provides?: HostCapability[]; +} + +export interface ExternalHostRunState { + driver: HostDriverId; + driverSlug: string; + displayName: string; + capabilitiesUsed: HostCapability[]; + data: Record; + result?: ExternalHostRunResult; +} + +export interface ExternalHostCapabilityContext { + config: ExternalHostConfig; + run: HostRunContext; + capability: HostCapability; + binding: ExternalHostCapabilityBinding; + state: ExternalHostRunState; +} + +export interface ExternalHostCapabilityImplementation { + id: string; + capabilities: HostCapability[]; + setup?( + context: ExternalHostCapabilityContext + ): Promise; + run?( + context: ExternalHostCapabilityContext + ): Promise; +} + +export interface ExternalHostRunner { + run(context: HostRunContext): Promise; +} + +export interface EvidenceSource { + source: TraceSource; + confidence: ObservationConfidence; +} diff --git a/src/index.ts b/src/index.ts index b5b91d6..ddf422b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -243,6 +243,40 @@ export { getMissingDependencyMessage, } from './evals/mcpHost/index.js'; +// External Host Evals (experimental) +export type { + EvidenceSource, + ExternalHostCapabilityBinding, + ExternalHostCapabilityContext, + ExternalHostCapabilityImplementation, + ExternalHostCapabilitiesConfig, + ExternalHostConfig, + ExternalHostDriverReference, + ExternalHostFailureKind, + ExternalHostMetadata, + ExternalHostRunResult, + ExternalHostSession, + ExternalHostSimulationResult, + ExternalHostType, + HostArtifact, + HostCapability, + HostDriverConfig, + HostDriverId, + HostRunContext, + ObservationConfidence, + TraceSource, +} from './evals/externalHost/index.js'; +export { + driverToSlug, + 
normalizeHostDriver, + parseDriverSlug, + getExternalHostConfigJsonSchema, + getExternalHostReference, + listExternalHostDriverReferences, + registerExternalHostCapability, + runExternalHostScenario, +} from './evals/externalHost/index.js'; + // Judge export { createJudge } from './judge/judgeClient.js'; export { diff --git a/src/mcp/response.ts b/src/mcp/response.ts index 1189250..5e5130b 100644 --- a/src/mcp/response.ts +++ b/src/mcp/response.ts @@ -190,6 +190,11 @@ export function extractText(response: unknown): string { return r.text; } + // Host simulation results expose the final answer as `response`. + if (typeof r.response === 'string') { + return r.response; + } + // Fallback to JSON return JSON.stringify(r); } diff --git a/src/reporters/mcpReporter.test.ts b/src/reporters/mcpReporter.test.ts index 2cc490b..1bbeb75 100644 --- a/src/reporters/mcpReporter.test.ts +++ b/src/reporters/mcpReporter.test.ts @@ -382,6 +382,77 @@ describe('MCPReporter.buildRunData()', () => { }); }); + describe('external host metadata', () => { + it('preserves external host trace metadata in run data', () => { + setResults(reporter, [ + makeResult({ + pass: true, + toolName: 'external_host', + externalHost: { + driver: { + provider: 'anthropic', + product: 'claude', + surface: 'cowork', + runtime: 'desktop-app', + platform: 'macos', + }, + driverSlug: 'anthropic.claude.cowork.desktop-app.macos', + displayName: 'Claude Cowork Desktop', + hostName: 'Claude Cowork Desktop', + hostType: 'desktop', + capabilitiesUsed: [ + 'control', + 'input', + 'completion', + 'trace', + 'normalize', + ], + traceSource: 'host-local-transcript', + traceConfidence: 'high', + traceLimitations: ['fixture limitation'], + artifacts: [ + { + kind: 'audit', + name: 'Claude audit log', + path: '/tmp/audit.jsonl', + }, + ], + session: { + id: 'local_123', + runMarker: 'MCP_SERVER_TESTER_TEST', + requestId: 'req_123', + }, + correlation: { + strategy: 'prompt_marker', + marker: 'MCP_SERVER_TESTER_TEST', + 
includedInPrompt: true, + }, + evidence: { + finalAnswer: { + source: 'host-local-transcript', + confidence: 'high', + }, + toolCalls: { + source: 'host-local-transcript', + confidence: 'high', + }, + }, + }, + }), + ]); + + const data = callBuildRunData(reporter, 100); + + expect(data.results[0]?.externalHost).toMatchObject({ + driverSlug: 'anthropic.claude.cowork.desktop-app.macos', + hostName: 'Claude Cowork Desktop', + traceSource: 'host-local-transcript', + traceConfidence: 'high', + session: { id: 'local_123', requestId: 'req_123' }, + }); + }); + }); + describe('conformanceChecks and serverCapabilities', () => { it('returns undefined conformanceChecks when none are recorded', () => { setResults(reporter, [makeResult({ pass: true })]); diff --git a/src/reporters/ui-src/components/Results/DetailModal.tsx b/src/reporters/ui-src/components/Results/DetailModal.tsx index d8dbcdf..bbe02ac 100644 --- a/src/reporters/ui-src/components/Results/DetailModal.tsx +++ b/src/reporters/ui-src/components/Results/DetailModal.tsx @@ -19,6 +19,213 @@ function formatResponsePreview(response: unknown): string { return JSON.stringify(response, null, 2) ?? ''; } +function getExternalHostEvidenceRows( + externalHost: NonNullable +) { + const labels = { + finalAnswer: 'Final answer', + toolCalls: 'Tool calls', + usage: 'Usage', + cost: 'Cost', + } as const; + const keys = Object.keys(labels) as Array; + + return keys + .map((key) => { + const evidence = externalHost.evidence?.[key]; + const source = evidence?.source ?? externalHost.sources?.[key]; + const confidence = evidence?.confidence; + + if (!source && !confidence) { + return undefined; + } + + return { + key, + label: labels[key], + source: source ?? 'unknown', + confidence: confidence ?? 
externalHost.traceConfidence, + }; + }) + .filter((row): row is NonNullable => row !== undefined); +} + +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function responseRecord(result: EvalCaseResult): Record { + return isRecord(result.response) ? result.response : {}; +} + +function resultToolCalls( + result: EvalCaseResult +): Array<{ id?: string; name: string; arguments: Record }> { + const toolCalls = responseRecord(result).toolCalls; + if (!Array.isArray(toolCalls)) { + return []; + } + + return toolCalls.filter( + ( + call + ): call is { + id?: string; + name: string; + arguments: Record; + } => + isRecord(call) && + typeof call.name === 'string' && + isRecord(call.arguments) + ); +} + +function finalAnswer(result: EvalCaseResult): string | undefined { + const response = responseRecord(result).response; + return typeof response === 'string' ? response : undefined; +} + +function usageForResult( + result: EvalCaseResult +): Record | undefined { + const responseUsage = responseRecord(result).usage; + return ( + (result.hostUsage as unknown as Record | undefined) ?? + (isRecord(responseUsage) ? responseUsage : undefined) + ); +} + +function numberField( + value: Record | undefined, + key: string +): number | undefined { + const nested = value?.[key]; + return typeof nested === 'number' ? nested : undefined; +} + +function formatNumber(value: number | undefined): string { + return value === undefined ? 'unknown' : value.toLocaleString(); +} + +function formatCost(value: number | undefined): string { + if (value === undefined) { + return 'unknown'; + } + return `$${value.toFixed(value === 0 ? 2 : 4)}`; +} + +function formatMs(value: number | undefined): string { + if (value === undefined) { + return 'unknown'; + } + return value >= 1000 + ? 
`${(value / 1000).toFixed(1)}s` + : `${value.toFixed(0)}ms`; +} + +function jsonPreview(value: unknown): string { + return JSON.stringify(value, null, 2) ?? ''; +} + +function InfoField({ + label, + value, +}: { + label: string; + value: React.ReactNode; +}) { + return ( +
+

+ {label} +

+
{value}
+
+ ); +} + +function JsonBlock({ value }: { value: unknown }) { + return ( +
+      {jsonPreview(value)}
+    
+ ); +} + +function expectationEntries(result: EvalCaseResult) { + return Object.entries(result.expectations ?? {}).filter( + (entry): entry is [string, NonNullable<(typeof entry)[1]>] => + entry[1] !== undefined + ); +} + +function failedExpectationEntries(result: EvalCaseResult) { + return expectationEntries(result).filter(([, expectation]) => { + return !expectation.pass; + }); +} + +function getVerdictSummary(result: EvalCaseResult): { + category: string; + reason: string; +} { + const failedAssertions = failedExpectationEntries(result).map( + ([type]) => type + ); + + if (result.pass) { + return { + category: 'Pass', + reason: 'All configured assertions passed.', + }; + } + + if (result.externalHost?.failureKind) { + return { + category: 'Host or automation failure', + reason: `The driver failed before producing trustworthy eval evidence: ${result.externalHost.failureKind}.`, + }; + } + + if (result.error) { + const firstLine = stripAnsiCodes(result.error).split('\n')[0]; + return { + category: 'Execution failure', + reason: firstLine, + }; + } + + if (failedAssertions.length > 0) { + return { + category: 'Assertion failure', + reason: `${failedAssertions.length} configured assertion${failedAssertions.length === 1 ? '' : 's'} failed: ${failedAssertions.join(', ')}.`, + }; + } + + return { + category: 'Failure', + reason: + 'The run failed without a specific assertion or host-driver error in the report.', + }; +} + +function evidenceSummary( + externalHost: NonNullable | undefined, + key: 'finalAnswer' | 'toolCalls' | 'usage' | 'cost' +): string { + if (!externalHost) { + return 'not reported'; + } + const evidence = externalHost.evidence?.[key]; + const source = evidence?.source ?? externalHost.sources?.[key]; + const confidence = evidence?.confidence ?? 
externalHost.traceConfidence; + + if (!source) { + return 'not reported'; + } + return `${source} · ${confidence}`; +} + interface DetailModalProps { result: EvalCaseResult | null; onClose: () => void; @@ -81,12 +288,24 @@ export function DetailModal({ result, onClose }: DetailModalProps) { const responseText = formatResponsePreview(result.response); const isLargeResponse = responseText.length > 500; - const hasAssertions = Object.keys(result.expectations ?? {}).length > 0; + const expectationRows = expectationEntries(result); + const failedExpectationRows = failedExpectationEntries(result); + const hasAssertions = expectationRows.length > 0; const hasIterations = result.iterationResults && result.iterationResults.length > 0; const iterations = result.iterationResults!; const displayRate = result.assertionPassRate; const infraErrorRate = result.infrastructureErrorRate; + const externalHostEvidenceRows = result.externalHost + ? getExternalHostEvidenceRows(result.externalHost) + : []; + const hostToolCalls = resultToolCalls(result); + const hostUsage = usageForResult(result); + const answer = finalAnswer(result); + const llmDurationMs = numberField(responseRecord(result), 'llmDurationMs'); + const mcpDurationMs = numberField(responseRecord(result), 'mcpDurationMs'); + const externalHostConfig = result.request?.externalHost; + const verdict = getVerdictSummary(result); return ( <> @@ -229,6 +448,24 @@ export function DetailModal({ result, onClose }: DetailModalProps) { {result.project} )} + {result.externalHost && ( + <> + + {result.externalHost.hostName} + + + {result.externalHost.traceConfidence} trace + + + )} {result.durationMs.toFixed(0)}ms @@ -244,18 +481,155 @@ export function DetailModal({ result, onClose }: DetailModalProps) { )} - {/* Request — show what was sent */} + +
+
+
+ + {verdict.category} + + {failedExpectationRows.length > 0 && ( + + {failedExpectationRows.length} failed assertion + {failedExpectationRows.length === 1 ? '' : 's'} + + )} +
+

+ {verdict.reason} +

+
+ + {result.externalHost && ( +
+ + + {result.externalHost.displayName} + +

+ {result.externalHost.driverSlug} +

+ + } + /> + + + + + +
+ )} +
+
+ + {/* Setup and configuration — show what the eval was configured to run */} {result.request && (result.request.args || result.request.scenario || - result.request.description) && ( - -
+ result.request.externalHost || + result.request.description || + result.request.expect) && ( + +
{result.request.description && (

{result.request.description}

)} + +
+ + {result.request.mode ?? result.toolName} + + } + /> + + {result.datasetName} + + } + /> + + {result.request.accuracyThreshold !== undefined && ( + + )} + {result.request.judgeReps !== undefined && ( + + )} + {result.request.tags && + result.request.tags.length > 0 && ( + + {result.request.tags.map((tag) => ( + + {tag} + + ))} +
+ } + /> + )} +
+ {result.request.scenario && (

@@ -266,6 +640,7 @@ export function DetailModal({ result, onClose }: DetailModalProps) {

)} + {result.request.mcpHostConfig && (
@@ -278,14 +653,210 @@ export function DetailModal({ result, onClose }: DetailModalProps) { )}
)} + + {externalHostConfig && ( +
+

+ External Host Driver +

+
+ + {externalHostConfig.driverSlug ?? + (typeof externalHostConfig.driver === 'string' + ? externalHostConfig.driver + : 'external host')} + + } + /> + + {typeof externalHostConfig.driver === 'object' && ( + <> + + + {externalHostConfig.driver.platform && ( + + )} + {externalHostConfig.driver.channel && ( + + )} + + )} + {externalHostConfig.hostType && ( + + )} + {externalHostConfig.variant && ( + + )} + {externalHostConfig.timeoutMs !== undefined && ( + + )} + {externalHostConfig.usesBuiltInDefaults !== + undefined && ( + + )} + {externalHostConfig.correlation && ( + + + {externalHostConfig.correlation.strategy ?? + 'none'} + + {externalHostConfig.correlation + .includeInPrompt !== undefined && ( +

+ prompt marker:{' '} + {externalHostConfig.correlation + .includeInPrompt + ? 'included' + : 'not included'} +

+ )} + {externalHostConfig.correlation + .promptTemplate && ( +

+ template:{' '} + { + externalHostConfig.correlation + .promptTemplate + } +

+ )} +
+ } + /> + )} +
+ + {externalHostConfig.capabilities && + Object.keys(externalHostConfig.capabilities).length > + 0 && ( +
+

+ Capability Bindings +

+
+ + + + + + + + + + + {Object.entries( + externalHostConfig.capabilities + ).flatMap(([capability, bindings]) => + bindings.map((binding, index) => ( + + + + + + + )) + )} + +
+ Capability + + Implementation + + Provides + + Options +
+ {capability} + + {binding.uses} + + {binding.provides?.join(', ') ?? + '-'} + + {binding.with ? ( +
+                                                {jsonPreview(binding.with)}
+                                              
+ ) : ( + '-' + )} +
+
+
+ )} + + {externalHostConfig.options && + Object.keys(externalHostConfig.options).length > + 0 && ( +
+

+ Driver Options +

+ +
+ )} +
+ )} + + {result.request.expect && ( +
+

+ Configured Expectations +

+ +
+ )} + {result.request.args && (

Arguments

-
-                          {JSON.stringify(result.request.args, null, 2)}
-                        
+
)} @@ -311,45 +882,334 @@ export function DetailModal({ result, onClose }: DetailModalProps) { defaultOpen={true} badge={ - { - Object.values(result.expectations).filter((e) => e?.pass) - .length - } - /{Object.values(result.expectations).filter(Boolean).length}{' '} - passed + {expectationRows.filter(([, e]) => e.pass).length}/ + {expectationRows.length} passed } >
- {Object.entries(result.expectations) - .filter(([_, exp]) => exp !== undefined) - .map(([type, exp]) => ( -
-
+ {expectationRows.map(([type, exp]) => ( +
+
+ + {exp.pass ? '✓' : '✗'} {type} + +
+ {exp.details && ( +
+                          {stripAnsiCodes(exp.details)}
+                        
+ )} +
+ ))} +
+ + )} + + {result.externalHost && ( + +
+ {answer && ( +
+

+ Final Answer +

+

+ {answer} +

+
+ )} + +
+
+
+ Tool Calls +
+
+ {hostToolCalls.length} +
+
+
+
+ Input Tokens +
+
+ {formatNumber(numberField(hostUsage, 'inputTokens'))} +
+
+
+
+ Output Tokens +
+
+ {formatNumber(numberField(hostUsage, 'outputTokens'))} +
+
+
+
Cost
+
+ {formatCost(numberField(hostUsage, 'totalCostUsd'))} +
+
+
+ + {hostToolCalls.length > 0 && ( +
+

+ Observed Tool Calls +

+
+ {hostToolCalls.map((call, i) => ( +
+
+ {call.name} + {call.id && ( + + {call.id} + + )} +
+ +
+ ))} +
+
+ )} + + {hostUsage && ( +
+

+ Usage & Durations +

+
+ + + + + + + {numberField(hostUsage, 'cacheReadInputTokens') !== + undefined && ( + + )} + {numberField(hostUsage, 'cacheCreationInputTokens') !== + undefined && ( + + )} +
+
+ )} + +
+ +

+ {result.externalHost.displayName} + {result.externalHost.hostVariant + ? ` / ${result.externalHost.hostVariant}` + : ''} +

+

+ {result.externalHost.driverSlug} +

+ + } + /> + + + {result.externalHost.session.id ?? 'unknown'} + + } + /> + + {result.externalHost.session.requestId ?? 'unknown'} + + } + /> + + {result.externalHost.session.runMarker} + + } + /> + + + {result.externalHost.correlation.strategy} + +

+ prompt marker{' '} + {result.externalHost.correlation.includedInPrompt + ? 'included' + : 'not included'} +

+ + } + /> + {result.externalHost.session.cliSessionId && ( + + {result.externalHost.session.cliSessionId} + + } + /> + )} +
+ +
+

+ Capabilities +

+
+ {result.externalHost.capabilitiesUsed.map( + (capability) => ( - {exp.pass ? '✓' : '✗'} {type} + {capability} -
- {exp.details && ( -
-                            {stripAnsiCodes(exp.details)}
-                          
- )} + ) + )} +
+
+ + {externalHostEvidenceRows.length > 0 && ( +
+

+ Evidence Sources +

+
+ {externalHostEvidenceRows.map((row) => ( +
+
{row.label}
+
+ {row.source} · {row.confidence} +
+
+ ))}
- ))} +
+ )} + + {result.externalHost.failureKind && ( +
+ Host failure: {result.externalHost.failureKind} +
+ )} + + {result.externalHost.traceLimitations && + result.externalHost.traceLimitations.length > 0 && ( +
+

+ Limitations +

+
    + {result.externalHost.traceLimitations.map( + (limitation, i) => ( +
  • {limitation}
  • + ) + )} +
+
+ )} + + {result.externalHost.artifacts.length > 0 && ( +
+

+ Artifacts +

+
+ {result.externalHost.artifacts.map((artifact, i) => ( +
+
{artifact.name}
+
+ {artifact.kind} + {artifact.contentType + ? ` · ${artifact.contentType}` + : ''} +
+ {artifact.path && ( +
+                                {artifact.path}
+                              
+ )} +
+ ))} +
+
+ )}
)} @@ -446,6 +1306,11 @@ export function DetailModal({ result, onClose }: DetailModalProps) { Tools called )} + {iterations.some((r) => r.externalHost) && ( + + Host trace + + )} Error @@ -522,6 +1387,24 @@ export function DetailModal({ result, onClose }: DetailModalProps) { )} )} + {iterations.some((r) => r.externalHost) && ( + + {iter.externalHost ? ( + + {iter.externalHost.driverSlug ?? + iter.externalHost.hostName}{' '} + · {iter.externalHost.traceConfidence} + + ) : ( + + — + + )} + + )} {iter.error ? stripAnsiCodes(iter.error) : '—'} diff --git a/src/reporters/ui-src/components/Results/ResultsTable.tsx b/src/reporters/ui-src/components/Results/ResultsTable.tsx index f9885d2..171381a 100644 --- a/src/reporters/ui-src/components/Results/ResultsTable.tsx +++ b/src/reporters/ui-src/components/Results/ResultsTable.tsx @@ -26,6 +26,65 @@ function formatMs(ms: number): string { return ms >= 1000 ? `${(ms / 1000).toFixed(1)}s` : `${ms.toFixed(0)}ms`; } +function isRecord(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function toolCallCount(result: EvalCaseResult): number { + const response = isRecord(result.response) ? result.response : undefined; + const toolCalls = response?.toolCalls; + return Array.isArray(toolCalls) ? toolCalls.length : 0; +} + +function usageRecord( + result: EvalCaseResult +): Record | undefined { + if (result.hostUsage) { + return result.hostUsage as unknown as Record; + } + const response = isRecord(result.response) ? result.response : undefined; + return isRecord(response?.usage) ? response.usage : undefined; +} + +function numberField( + value: Record | undefined, + key: string +): number | undefined { + const nested = value?.[key]; + return typeof nested === 'number' ? nested : undefined; +} + +function formatCost(value: number): string { + return `$${value.toFixed(value === 0 ? 
2 : 4)}`; +} + +function failedExpectationTypes(result: EvalCaseResult): string[] { + return Object.entries(result.expectations ?? {}) + .filter(([, expectation]) => expectation !== undefined && !expectation.pass) + .map(([type]) => type); +} + +function failureLabel(result: EvalCaseResult): string | undefined { + if (result.pass) { + return undefined; + } + + if (result.externalHost?.failureKind) { + return `host: ${result.externalHost.failureKind}`; + } + + if (result.error) { + return 'execution error'; + } + + const failedAssertions = failedExpectationTypes(result); + if (failedAssertions.length > 0) { + return `assertion: ${failedAssertions.join(', ')}`; + } + + return 'failed'; +} + interface ResultRowProps { result: EvalCaseResult; onSelectResult?: (result: EvalCaseResult) => void; @@ -45,6 +104,13 @@ function ResultRow({ const iterDots = result.iterationResults ?? []; const cappedDots = iterDots.slice(0, 10); const hasMore = iterDots.length > 10; + const observedToolCallCount = toolCallCount(result); + const usage = usageRecord(result); + const inputTokens = numberField(usage, 'inputTokens') ?? 0; + const outputTokens = numberField(usage, 'outputTokens') ?? 0; + const totalTokens = inputTokens + outputTokens; + const totalCostUsd = numberField(usage, 'totalCostUsd'); + const rowFailureLabel = failureLabel(result); const ariaLabel = `${result.toolName ? result.toolName + ': ' : ''}${result.id}, ${result.pass ? 'passed' : 'failed'}`; @@ -90,6 +156,14 @@ function ResultRow({ ▲ fixed )} + {rowFailureLabel && ( + + {rowFailureLabel} + + )} {result.assertionPassRate !== undefined && ( )} + {result.externalHost && ( + <> + + {result.externalHost.driver.provider}/ + {result.externalHost.driver.product} + + + {result.externalHost.driver.surface} ·{' '} + {result.externalHost.driver.runtime} + {result.externalHost.driver.platform + ? 
` · ${result.externalHost.driver.platform}` + : ''} + + + {result.externalHost.traceConfidence} trace + + + {observedToolCallCount} tool + {observedToolCallCount === 1 ? '' : 's'} + + {totalTokens > 0 && ( + + {totalTokens.toLocaleString()} tokens + + )} + {totalCostUsd !== undefined && ( + + {formatCost(totalCostUsd)} + + )} + + )} + {isEval ? ( {result.id} - {result.toolName && result.toolName !== 'mcp_host' ? ( + {result.toolName && + result.toolName !== 'mcp_host' && + result.toolName !== 'external_host' ? ( {result.toolName} @@ -186,6 +319,10 @@ function ResultRow({ mcp_host + ) : result.toolName === 'external_host' ? ( + + external_host + ) : null} {showProjectBadge && result.project && ( diff --git a/src/types/reporter.ts b/src/types/reporter.ts index 66cf00a..fdd1f43 100644 --- a/src/types/reporter.ts +++ b/src/types/reporter.ts @@ -14,6 +14,17 @@ import type { ExpectationBreakdown, UsageMetrics, } from './index.js'; +import type { + ExternalHostCorrelationConfig, + ExternalHostMetadata, + HostDriverId, +} from '../evals/externalHost/types.js'; + +export interface SerializedExternalHostCapabilityBinding { + uses: string; + provides?: string[]; + with?: Record; +} /** * Configuration options for MCP Eval Reporter @@ -194,6 +205,8 @@ export interface IterationResult { }; /** Token usage from mcp_host LLM simulation in this iteration */ hostUsage?: UsageMetrics; + /** External host metadata for this iteration */ + externalHost?: ExternalHostMetadata; } /** @@ -201,11 +214,29 @@ export interface IterationResult { * Preserves what was sent so results are self-contained for debugging. 
*/ export interface EvalCaseRequest { + /** Eval execution mode */ + mode?: string; + /** Human-readable description of the case */ description?: string; /** Runtime tool override variant identifier, when one was used */ toolOverrideVariantId?: string; + /** Number of iterations configured for this case */ + iterations?: number; + + /** Accuracy threshold configured for this case */ + accuracyThreshold?: number; + + /** Judge repetitions configured for this case */ + judgeReps?: number; + + /** Tags from the source eval case */ + tags?: string[]; + + /** Configured expectation block, sanitized for reporter output */ + expect?: Record; + // Direct mode fields /** Tool arguments (direct mode) */ args?: Record; @@ -218,6 +249,19 @@ export interface EvalCaseRequest { provider?: string; model?: string; }; + /** External host configuration summary (external_host mode) */ + externalHost?: { + driver: HostDriverId | string; + driverSlug?: string; + name?: string; + hostType?: string; + variant?: string; + timeoutMs?: number; + usesBuiltInDefaults?: boolean; + correlation?: ExternalHostCorrelationConfig; + options?: Record; + capabilities?: Record; + }; } /** @@ -377,6 +421,12 @@ export interface EvalCaseResult { * Summed across all iterations. Only populated for mcp_host mode cases. */ hostUsage?: UsageMetrics; + + /** + * External host trace and evidence metadata. + * Populated for external_host mode cases. 
+ */ + externalHost?: ExternalHostMetadata; } /** diff --git a/vitest.config.mts b/vitest.config.mts index 63e3f76..8bddbb9 100644 --- a/vitest.config.mts +++ b/vitest.config.mts @@ -5,7 +5,12 @@ export default defineConfig({ globals: true, environment: 'node', include: ['src/**/*.test.ts', 'tests/**/*.test.ts'], - exclude: ['node_modules', 'dist', 'tests/**/*.spec.ts'], + exclude: [ + 'node_modules', + 'dist', + 'tests/**/*.spec.ts', + 'src/**/*.integration.test.ts', + ], coverage: { provider: 'v8', reporter: ['text', 'json', 'html'], diff --git a/vitest.external-host.config.mts b/vitest.external-host.config.mts new file mode 100644 index 0000000..438d7de --- /dev/null +++ b/vitest.external-host.config.mts @@ -0,0 +1,10 @@ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + globals: true, + environment: 'node', + include: ['src/**/*.integration.test.ts'], + testTimeout: 150_000, + }, +});