diff --git a/docs/api-reference.md b/docs/api-reference.md
index 36f6400..f43edb4 100644
--- a/docs/api-reference.md
+++ b/docs/api-reference.md
@@ -435,10 +435,7 @@ The result includes pass-rate deltas, optional tool precision/recall/F1 deltas,
 
 **Result Structure:**
 
-```typescript snippet=src/evals/evalRunner.ts#L106-L184
-/**
- * Overall result of running an eval dataset
- */
+```typescript snippet=src/evals/evalRunner.ts#L121-L195
 export interface EvalRunnerResult {
   /**
    * Total number of cases
@@ -1043,7 +1040,12 @@ interface MCPConformanceResult {
 
 ### `EvalExpectBlock`
 
-```typescript snippet=src/evals/datasetTypes.ts#L186-L277
+```typescript snippet=src/evals/datasetTypes.ts#L190-L288
+/**
+ * Unified expectation block for eval cases
+ *
+ * Mirrors the Playwright matcher API for consistency.
+ */
 export interface EvalExpectBlock {
   /**
    * Exact response match (toMatchToolResponse)
@@ -1102,8 +1104,9 @@ export interface EvalExpectBlock {
   };
 
   /**
-   * Asserts which tools the LLM called during a mcp_host simulation.
-   * Only meaningful for mcp_host mode — direct mode has no tool call trace.
+   * Asserts which tools the LLM called during a host simulation.
+   * Only meaningful for mcp_host or external_host runs with high-confidence
+   * structured tool evidence — direct mode has no tool call trace.
    */
   toolsTriggered?: {
     /** Expected tool calls */
@@ -1125,7 +1128,8 @@ export interface EvalExpectBlock {
   };
 
   /**
-   * Asserts the number of tool calls made during a mcp_host simulation.
+   * Asserts the number of tool calls made during a host simulation.
+   * External-host runs require high-confidence structured tool evidence.
    */
   toolCallCount?: {
     /** Minimum number of tool calls */
@@ -1140,7 +1144,14 @@ export interface EvalExpectBlock {
 
 ### `EvalCase`
 
-````typescript snippet=src/evals/datasetTypes.ts#L27-L139
+````typescript snippet=src/evals/datasetTypes.ts#L23-L148
+/**
+ * A single eval test case
+ *
+ * For 'direct' mode: toolName and args are required
+ * For 'mcp_host' mode: scenario and mcpHostConfig are required
+ * For 'external_host' mode: scenario and externalHost are required
+ */
 export interface EvalCase {
   /**
    * Unique identifier for this test case
@@ -1155,7 +1166,8 @@ export interface EvalCase {
   /**
    * Evaluation mode
    * - 'direct': Direct API calls to MCP tools (default)
-   * - 'mcp_host': LLM-driven tool selection via natural language
+   * - 'mcp_host': SDK/CLI host simulation via natural language
+   * - 'external_host': Real external MCP host driven by configured capabilities
    *
    * @default 'direct'
    */
@@ -1172,7 +1184,7 @@ export interface EvalCase {
   args?: Record<string, unknown>;
 
   /**
-   * Natural language scenario for LLM to execute (optional, required for 'mcp_host' mode)
+   * Natural language scenario for LLM to execute (required for 'mcp_host' and 'external_host' modes)
    *
    * @example "Get the weather for London and tell me if I need an umbrella"
    */
@@ -1185,6 +1197,11 @@ export interface EvalCase {
    */
   mcpHostConfig?: MCPHostConfig;
 
+  /**
+   * External host configuration (required for 'external_host' mode)
+   */
+  externalHost?: ExternalHostConfig;
+
   /**
    * Additional metadata for this test case
    *
@@ -1256,18 +1273,6 @@ export interface EvalCase {
 }
 ````
 
-### `EvalDataset`
-
-```typescript
-interface EvalDataset {
-  name: string;
-  description?: string;
-  cases: EvalCase[];
-  metadata?: Record<string, unknown>;
-  schemas?: Record<string, ZodSchema>; // Zod schemas for toMatchToolSchema assertions
-}
-```
-
 ## Next Steps
 
 - See the [Authentication Guide](./authentication.md) for OAuth and token auth
diff --git a/package-lock.json b/package-lock.json
index 3376a2e..147c174 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -16,6 +16,7 @@
         "debug": "^4.4.3",
         "ink": "^5.2.1",
         "ink-spinner": "^5.0.0",
+        "ndjson": "^2.0.0",
         "oauth4webapi": "^3.0.0",
         "open": "^10.1.0",
         "react": "^18.3.1",
@@ -30,6 +31,7 @@
         "@playwright/test": "^1.49.0",
         "@release-it-plugins/lerna-changelog": "^8.0.1",
         "@types/debug": "^4.1.12",
+        "@types/ndjson": "^2.0.4",
         "@types/node": "^22.10.2",
         "@types/react": "^18.3.12",
         "@types/react-dom": "^18.3.1",
@@ -5083,6 +5085,17 @@
       "dev": true,
       "license": "MIT"
     },
+    "node_modules/@types/ndjson": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/@types/ndjson/-/ndjson-2.0.4.tgz",
+      "integrity": "sha512-ajAl7AjhFstF6waORYNSS49GL5iBKisqJlgvXuprXFKCX9fto4ordlNU3+XMgkMddgeR0WoQQBmKUk0v0dJ4pw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@types/node": "*",
+        "@types/through": "*"
+      }
+    },
     "node_modules/@types/node": {
       "version": "22.19.3",
       "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.3.tgz",
@@ -5149,6 +5162,16 @@
         "@types/node": "*"
       }
     },
+    "node_modules/@types/through": {
+      "version": "0.0.33",
+      "resolved": "https://registry.npmjs.org/@types/through/-/through-0.0.33.tgz",
+      "integrity": "sha512-HsJ+z3QuETzP3cswwtzt2vEIiHBk/dCcHGhbmG5X3ecnwFD/lPrMpliGXxSCg03L9AhrdwA4Oz/qfspkDW+xGQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@types/node": "*"
+      }
+    },
     "node_modules/@types/unist": {
       "version": "3.0.3",
       "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz",
@@ -10190,7 +10213,6 @@
       "version": "5.0.1",
       "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz",
       "integrity": "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==",
-      "dev": true,
       "license": "ISC"
     },
     "node_modules/json-with-bigint": {
@@ -11732,7 +11754,6 @@
       "version": "1.2.8",
       "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
       "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==",
-      "dev": true,
       "license": "MIT",
       "funding": {
         "url": "https://github.com/sponsors/ljharb"
@@ -11944,7 +11965,6 @@
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/ndjson/-/ndjson-2.0.0.tgz",
       "integrity": "sha512-nGl7LRGrzugTtaFcJMhLbpzJM6XdivmbkdlaGcrk/LXg2KL/YBC6z1g70xh0/al+oFuVFP8N8kiWRucmeEH/qQ==",
-      "dev": true,
       "license": "BSD-3-Clause",
       "dependencies": {
         "json-stringify-safe": "^5.0.1",
@@ -13628,7 +13648,6 @@
       "version": "3.6.2",
       "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
       "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
-      "dev": true,
       "license": "MIT",
       "dependencies": {
         "inherits": "^2.0.3",
@@ -14114,7 +14133,6 @@
       "version": "5.2.1",
       "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
       "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
-      "devOptional": true,
       "funding": [
         {
           "type": "github",
@@ -14443,7 +14461,6 @@
       "version": "3.2.2",
       "resolved": "https://registry.npmjs.org/split2/-/split2-3.2.2.tgz",
       "integrity": "sha512-9NThjpgZnifTkJpzTZ7Eue85S49QwpNhZTq6GRJwObb6jnLFNGB7Qm73V5HewTROPyxD0C29xqmaI68bQtV+hg==",
-      "dev": true,
       "license": "ISC",
       "dependencies": {
         "readable-stream": "^3.0.0"
@@ -14534,7 +14551,6 @@
       "version": "1.3.0",
       "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
       "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
-      "dev": true,
       "license": "MIT",
       "dependencies": {
         "safe-buffer": "~5.2.0"
@@ -14874,7 +14890,6 @@
       "version": "4.0.2",
       "resolved": "https://registry.npmjs.org/through2/-/through2-4.0.2.tgz",
       "integrity": "sha512-iOqSav00cVxEEICeD7TjLB1sueEL+81Wpzp2bY17uZjZN0pWZPuo4suZ/61VujxmqSGFfgOcNuTZ85QJwNZQpw==",
-      "dev": true,
       "license": "MIT",
       "dependencies": {
         "readable-stream": "3"
@@ -16438,7 +16453,6 @@
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
       "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
-      "dev": true,
       "license": "MIT"
     },
     "node_modules/validate-npm-package-name": {
diff --git a/package.json b/package.json
index d5c89dc..5af7e9f 100644
--- a/package.json
+++ b/package.json
@@ -72,6 +72,9 @@
     "preview-reporter": "tsx scripts/preview-reporter.ts",
     "test": "vitest run",
     "test:all": "npm run build && npm run format:check && npm run lint && npm run typecheck && npm test",
+    "test:external-host": "vitest run --config vitest.external-host.config.mts",
+    "test:external-host:chat": "vitest run --config vitest.external-host.config.mts -t \"Claude Chat\"",
+    "test:external-host:cowork": "vitest run --config vitest.external-host.config.mts -t \"Claude Cowork\"",
     "test:playwright": "playwright test",
     "test:watch": "vitest",
     "typecheck": "tsc --noEmit"
@@ -84,6 +87,7 @@
     "debug": "^4.4.3",
     "ink": "^5.2.1",
     "ink-spinner": "^5.0.0",
+    "ndjson": "^2.0.0",
     "oauth4webapi": "^3.0.0",
     "open": "^10.1.0",
     "react": "^18.3.1",
@@ -95,6 +99,7 @@
     "@playwright/test": "^1.49.0",
     "@release-it-plugins/lerna-changelog": "^8.0.1",
     "@types/debug": "^4.1.12",
+    "@types/ndjson": "^2.0.4",
     "@types/node": "^22.10.2",
     "@types/react": "^18.3.12",
     "@types/react-dom": "^18.3.1",
diff --git a/src/assertions/validators/toolCalls.test.ts b/src/assertions/validators/toolCalls.test.ts
index 42da8b7..637c613 100644
--- a/src/assertions/validators/toolCalls.test.ts
+++ b/src/assertions/validators/toolCalls.test.ts
@@ -182,7 +182,7 @@ describe('validateToolCalls', () => {
       calls: [{ name: 'search' }],
     });
     expect(v.pass).toBe(false);
-    expect(v.message).toContain('mcp_host');
+    expect(v.message).toContain('host simulation response');
   });
 });
 
@@ -292,6 +292,6 @@ describe('validateToolCallCount', () => {
   it('returns error when response is not an MCPHostSimulationResult', () => {
     const v = validateToolCallCount('not a simulation result', { exact: 1 });
     expect(v.pass).toBe(false);
-    expect(v.message).toContain('mcp_host');
+    expect(v.message).toContain('host simulation response');
   });
 });
diff --git a/src/assertions/validators/toolCalls.ts b/src/assertions/validators/toolCalls.ts
index ccb36fe..2a11b70 100644
--- a/src/assertions/validators/toolCalls.ts
+++ b/src/assertions/validators/toolCalls.ts
@@ -102,9 +102,9 @@ function findMatchingCall(
 }
 
 /**
- * Validates tool calls made during an MCP host simulation.
+ * Validates tool calls made during a host simulation.
  *
- * @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
+ * @param response - Must be an MCPHostSimulationResult-compatible response
  * @param expectation - Expected tool call specification
  */
 export function validateToolCalls(
@@ -115,7 +115,7 @@ export function validateToolCalls(
     return {
       pass: false,
       message:
-        'toolsTriggered expectation requires mcp_host mode — response must be an MCPHostSimulationResult',
+        'toolsTriggered expectation requires a host simulation response with structured tool calls',
     };
   }
 
@@ -206,9 +206,9 @@ export function validateToolCalls(
 }
 
 /**
- * Validates the number of tool calls made during an MCP host simulation.
+ * Validates the number of tool calls made during a host simulation.
  *
- * @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
+ * @param response - Must be an MCPHostSimulationResult-compatible response
  * @param options - Count constraints (min, max, exact)
  */
 export function validateToolCallCount(
@@ -219,7 +219,7 @@ export function validateToolCallCount(
     return {
       pass: false,
       message:
-        'toolCallCount expectation requires mcp_host mode — response must be an MCPHostSimulationResult',
+        'toolCallCount expectation requires a host simulation response with structured tool calls',
     };
   }
 
diff --git a/src/assertions/validators/validators.test.ts b/src/assertions/validators/validators.test.ts
index d4f19cc..1e7b361 100644
--- a/src/assertions/validators/validators.test.ts
+++ b/src/assertions/validators/validators.test.ts
@@ -183,6 +183,18 @@ describe('validateText', () => {
       const result = validateText(response, 'result');
       expect(result.pass).toBe(true);
     });
+
+    it('should prefer host simulation final response over metadata JSON', () => {
+      // 'metadata-only' appears solely under externalHost, never in `response`.
+      const simulationResult = {
+        response: 'final answer text',
+        externalHost: { traceLimitations: ['metadata-only text'] },
+      };
+      const matchesAnswer = validateText(simulationResult, 'final answer');
+      const matchesMetadata = validateText(simulationResult, 'metadata-only');
+      expect(matchesAnswer.pass).toBe(true);
+      expect(matchesMetadata.pass).toBe(false);
+    });
   });
 });
 
diff --git a/src/evals/datasetTypes.test.ts b/src/evals/datasetTypes.test.ts
index 94201d7..e1df9df 100644
--- a/src/evals/datasetTypes.test.ts
+++ b/src/evals/datasetTypes.test.ts
@@ -92,6 +92,68 @@ describe('datasetTypes', () => {
       expect(() => validateEvalCase(evalCase)).not.toThrow();
     });
 
+    it('should accept external_host eval case configuration', () => {
+      const evalCase = {
+        id: 'external-1',
+        mode: 'external_host' as const,
+        scenario: 'Reply with exactly hello',
+        externalHost: {
+          driver: {
+            provider: 'anthropic',
+            product: 'claude',
+            surface: 'cowork',
+            runtime: 'desktop-app',
+            platform: 'macos',
+          },
+          name: 'Claude Cowork Desktop',
+          timeoutMs: 120000,
+          capabilities: {
+            control: [
+              { uses: 'builtin:platform.macos' },
+              { uses: 'builtin:anthropic.claude.coworkSurface' },
+            ],
+            input: {
+              uses: 'builtin:desktop.macos.accessibilitySubmit',
+              with: { createNewConversation: false },
+            },
+            completion: {
+              uses: 'builtin:anthropic.claude.localAgentTrace',
+              provides: ['trace'],
+            },
+            normalize: {
+              uses: 'builtin:anthropic.claude.localAgentNormalize',
+            },
+          },
+          correlation: {
+            strategy: 'prompt_marker',
+            includeInPrompt: false,
+            promptTemplate: 'trace: {{marker}}',
+          },
+          options: {
+            appName: 'Claude',
+            newConversationShortcut: 'cmd+n',
+          },
+        },
+      };
+
+      const result = validateEvalCase(evalCase);
+
+      expect(result).toEqual(evalCase);
+    });
+
+    it('should reject external_host configuration without a driver', () => {
+      // externalHost deliberately omits `driver`; only `name` is supplied.
+      const driverlessCase = {
+        id: 'external-1',
+        mode: 'external_host' as const,
+        scenario: 'Reply with exactly hello',
+        externalHost: { name: 'Claude Cowork Desktop' },
+      };
+
+      const validate = () => validateEvalCase(driverlessCase);
+      expect(validate).toThrow(ZodError);
+    });
+
     it('should accept eval case with complex args', () => {
       const evalCase = {
         id: 'test-1',
diff --git a/src/evals/datasetTypes.ts b/src/evals/datasetTypes.ts
index 0a7d7ba..01fcd7e 100644
--- a/src/evals/datasetTypes.ts
+++ b/src/evals/datasetTypes.ts
@@ -1,5 +1,7 @@
 import { z } from 'zod';
 import type { MCPHostConfig } from './mcpHost/mcpHostTypes.js';
+import type { ExternalHostConfig } from './externalHost/types.js';
+import { ExternalHostConfigSchema } from './externalHost/schema.js';
 import type { SnapshotSanitizer } from '../assertions/validators/types.js';
 import type { BuiltInRubric } from '../judge/judgeTypes.js';
 
@@ -16,13 +18,14 @@ export type {
 /**
  * Evaluation mode
  */
-export type EvalMode = 'direct' | 'mcp_host';
+export type EvalMode = 'direct' | 'mcp_host' | 'external_host';
 
 /**
  * A single eval test case
  *
  * For 'direct' mode: toolName and args are required
  * For 'mcp_host' mode: scenario and mcpHostConfig are required
+ * For 'external_host' mode: scenario and externalHost are required
  */
 export interface EvalCase {
   /**
@@ -38,7 +41,8 @@ export interface EvalCase {
   /**
    * Evaluation mode
    * - 'direct': Direct API calls to MCP tools (default)
-   * - 'mcp_host': LLM-driven tool selection via natural language
+   * - 'mcp_host': SDK/CLI host simulation via natural language
+   * - 'external_host': Real external MCP host driven by configured capabilities
    *
    * @default 'direct'
    */
@@ -55,7 +59,7 @@ export interface EvalCase {
   args?: Record<string, unknown>;
 
   /**
-   * Natural language scenario for LLM to execute (optional, required for 'mcp_host' mode)
+   * Natural language scenario for LLM to execute (required for 'mcp_host' and 'external_host' modes)
    *
    * @example "Get the weather for London and tell me if I need an umbrella"
    */
@@ -68,6 +72,11 @@ export interface EvalCase {
    */
   mcpHostConfig?: MCPHostConfig;
 
+  /**
+   * External host configuration (required for 'external_host' mode)
+   */
+  externalHost?: ExternalHostConfig;
+
   /**
    * Additional metadata for this test case
    *
@@ -241,8 +250,9 @@ export interface EvalExpectBlock {
   };
 
   /**
-   * Asserts which tools the LLM called during a mcp_host simulation.
-   * Only meaningful for mcp_host mode — direct mode has no tool call trace.
+   * Asserts which tools the LLM called during a host simulation.
+   * Only meaningful for mcp_host or external_host runs with high-confidence
+   * structured tool evidence — direct mode has no tool call trace.
    */
   toolsTriggered?: {
     /** Expected tool calls */
@@ -264,7 +274,8 @@ export interface EvalExpectBlock {
   };
 
   /**
-   * Asserts the number of tool calls made during a mcp_host simulation.
+   * Asserts the number of tool calls made during a host simulation.
+   * External-host runs require high-confidence structured tool evidence.
    */
   toolCallCount?: {
     /** Minimum number of tool calls */
@@ -447,11 +458,12 @@ const EvalExpectBlockSchema = z.object({
 export const EvalCaseSchema = z.object({
   id: z.string().min(1, 'id must not be empty'),
   description: z.string().optional(),
-  mode: z.enum(['direct', 'mcp_host']).optional(),
+  mode: z.enum(['direct', 'mcp_host', 'external_host']).optional(),
   toolName: z.string().min(1, 'toolName must not be empty').optional(),
   args: z.record(z.string(), z.unknown()).optional(),
   scenario: z.string().optional(),
   mcpHostConfig: MCPHostConfigSchema.optional(),
+  externalHost: ExternalHostConfigSchema.optional(),
   metadata: z.record(z.string(), z.unknown()).optional(),
   iterations: z.number().int().min(1).optional(),
   accuracyThreshold: z.number().min(0).max(1).optional(),
diff --git a/src/evals/evalRunner.externalHost.test.ts b/src/evals/evalRunner.externalHost.test.ts
new file mode 100644
index 0000000..8cd888b
--- /dev/null
+++ b/src/evals/evalRunner.externalHost.test.ts
@@ -0,0 +1,365 @@
+import { describe, expect, it, vi } from 'vitest';
+import type { MCPFixtureApi } from '../mcp/fixtures/mcpFixture.js';
+import type {
+  ExternalHostRunResult,
+  ExternalHostSimulationResult,
+} from './externalHost/types.js';
+
+const TEST_CORRELATION = {
+  strategy: 'prompt_marker',
+  marker: 'MCP_SERVER_TESTER_TEST',
+  includedInPrompt: true,
+} as const;
+
+vi.mock('./externalHost/runtime.js', () => ({
+  runExternalHostScenario: vi.fn(async () => {
+    const result: ExternalHostSimulationResult = {
+      success: true,
+      response: 'external host trace acknowledged.',
+      toolCalls: [{ name: 'search', arguments: { query: 'planning' } }],
+      scenario: 'unused',
+      usage: {
+        inputTokens: 10,
+        outputTokens: 5,
+        totalCostUsd: 0.01,
+        durationMs: 1000,
+      },
+      externalHost: {
+        driver: {
+          provider: 'anthropic',
+          product: 'claude',
+          surface: 'cowork',
+          runtime: 'desktop-app',
+          platform: 'macos',
+        },
+        driverSlug: 'anthropic.claude.cowork.desktop-app.macos',
+        displayName: 'Claude Cowork Desktop',
+        hostName: 'Claude Cowork Desktop',
+        hostType: 'desktop',
+        capabilitiesUsed: [
+          'control',
+          'input',
+          'completion',
+          'trace',
+          'normalize',
+        ],
+        traceSource: 'host-local-transcript',
+        traceConfidence: 'high',
+        traceLimitations: ['fixture limitation'],
+        artifacts: [
+          {
+            kind: 'audit',
+            name: 'Claude audit log',
+            path: '/tmp/audit.jsonl',
+          },
+        ],
+        session: {
+          id: 'local_123',
+          runMarker: 'MCP_SERVER_TESTER_TEST',
+          requestId: 'req_123',
+        },
+        correlation: TEST_CORRELATION,
+        sources: {
+          finalAnswer: 'host-local-transcript',
+          toolCalls: 'host-local-transcript',
+          usage: 'host-local-transcript',
+          cost: 'host-local-transcript',
+        },
+        evidence: {
+          finalAnswer: {
+            source: 'host-local-transcript',
+            confidence: 'high',
+          },
+          toolCalls: {
+            source: 'host-local-transcript',
+            confidence: 'high',
+          },
+          usage: { source: 'host-local-transcript', confidence: 'high' },
+          cost: { source: 'host-local-transcript', confidence: 'high' },
+        },
+      },
+    };
+    return result;
+  }),
+}));
+
+const { runEvalCase } = await import('./evalRunner.js');
+const { runExternalHostScenario } = await import('./externalHost/runtime.js');
+
+/** Stub eval context: the mcp fixture only sets authType and project. */
+function makeContext(): { mcp: MCPFixtureApi } {
+  const mcp = {
+    authType: 'none',
+    project: 'external-host-test',
+  } as MCPFixtureApi;
+  return { mcp };
+}
+
+describe('runEvalCase external_host mode', () => {
+  it('runs an external host case through existing expectations and preserves trace metadata', async () => {
+    const result = await runEvalCase(
+      {
+        id: 'external-host-case',
+        mode: 'external_host',
+        scenario: 'Say hello and search.',
+        externalHost: {
+          driver: 'anthropic.claude.cowork.desktop-app.macos',
+          name: 'Claude Cowork Desktop',
+        },
+        expect: {
+          containsText: 'trace acknowledged',
+          toolsTriggered: {
+            calls: [{ name: 'search', arguments: { query: 'planning' } }],
+          },
+        },
+      },
+      makeContext()
+    );
+
+    expect(runExternalHostScenario).toHaveBeenCalledWith(
+      'Say hello and search.',
+      {
+        driver: 'anthropic.claude.cowork.desktop-app.macos',
+        name: 'Claude Cowork Desktop',
+      },
+      { caseId: 'external-host-case' }
+    );
+    expect(result.pass).toBe(true);
+    expect(result.toolName).toBe('external_host');
+    expect(result.hostUsage).toMatchObject({ totalCostUsd: 0.01 });
+    expect(result.externalHost).toMatchObject({
+      hostName: 'Claude Cowork Desktop',
+      traceSource: 'host-local-transcript',
+      traceConfidence: 'high',
+      session: { id: 'local_123', requestId: 'req_123' },
+    });
+    expect(result.request?.externalHost).toEqual({
+      driver: 'anthropic.claude.cowork.desktop-app.macos',
+      driverSlug: 'anthropic.claude.cowork.desktop-app.macos',
+      name: 'Claude Cowork Desktop',
+      hostType: undefined,
+      variant: undefined,
+      timeoutMs: undefined,
+      usesBuiltInDefaults: true,
+      correlation: {
+        strategy: 'prompt_marker',
+        includeInPrompt: true,
+      },
+      options: undefined,
+      capabilities: {
+        control: [
+          { uses: 'builtin:platform.macos' },
+          {
+            uses: 'builtin:anthropic.claude.activateCoworkSurface',
+            with: { appName: 'Claude' },
+          },
+        ],
+        input: [
+          {
+            uses: 'builtin:desktop.macos.accessibilitySubmit',
+            with: { appName: 'Claude', createNewConversation: true },
+          },
+        ],
+        completion: [
+          {
+            uses: 'builtin:anthropic.claude.localAgentTrace',
+            provides: ['trace'],
+          },
+        ],
+        normalize: [{ uses: 'builtin:anthropic.claude.localAgentNormalize' }],
+      },
+    });
+    expect(result.mcpHostTrace?.calls).toEqual([
+      {
+        name: 'search',
+        arguments: { query: 'planning' },
+        status: 'expected',
+      },
+    ]);
+  });
+
+  it('fails tool assertions as trace insufficiency when external host evidence is low confidence', async () => {
+    vi.mocked(runExternalHostScenario).mockResolvedValueOnce({
+      success: true,
+      response: 'external host trace acknowledged.',
+      toolCalls: [{ name: 'search', arguments: { query: 'planning' } }],
+      externalHost: {
+        driver: {
+          provider: 'anthropic',
+          product: 'claude',
+          surface: 'chat',
+          runtime: 'desktop-app',
+          platform: 'macos',
+        },
+        driverSlug: 'anthropic.claude.chat.desktop-app.macos',
+        displayName: 'Claude Chat Desktop',
+        hostName: 'Claude Chat Desktop',
+        hostType: 'desktop',
+        capabilitiesUsed: [
+          'control',
+          'input',
+          'completion',
+          'trace',
+          'normalize',
+        ],
+        traceSource: 'accessibility',
+        traceConfidence: 'low',
+        artifacts: [],
+        session: { runMarker: 'MCP_SERVER_TESTER_TEST' },
+        correlation: TEST_CORRELATION,
+        sources: {
+          finalAnswer: 'accessibility',
+          toolCalls: 'none',
+          usage: 'none',
+          cost: 'none',
+        },
+        evidence: {
+          finalAnswer: { source: 'accessibility', confidence: 'low' },
+          toolCalls: { source: 'none', confidence: 'unknown' },
+        },
+      },
+    });
+
+    const result = await runEvalCase(
+      {
+        id: 'external-host-low-confidence',
+        mode: 'external_host',
+        scenario: 'Say hello and search.',
+        externalHost: {
+          driver: 'anthropic.claude.chat.desktop-app.macos',
+        },
+        expect: {
+          containsText: 'trace acknowledged',
+          toolsTriggered: {
+            calls: [{ name: 'search' }],
+          },
+        },
+      },
+      makeContext()
+    );
+
+    expect(result.pass).toBe(false);
+    expect(result.mcpHostTrace).toBeUndefined();
+    expect(result.expectations.toolsTriggered?.details).toContain(
+      'cannot support tool-call assertions'
+    );
+  });
+
+  it('requires high-confidence structured evidence for tool assertions when per-field evidence is absent', async () => {
+    vi.mocked(runExternalHostScenario).mockResolvedValueOnce({
+      success: true,
+      response: 'external host trace acknowledged.',
+      toolCalls: [{ name: 'search', arguments: { query: 'planning' } }],
+      externalHost: {
+        driver: {
+          provider: 'anthropic',
+          product: 'claude',
+          surface: 'cowork',
+          runtime: 'desktop-app',
+          platform: 'macos',
+        },
+        driverSlug: 'anthropic.claude.cowork.desktop-app.macos',
+        displayName: 'Claude Cowork Desktop',
+        hostName: 'Claude Cowork Desktop',
+        hostType: 'desktop',
+        capabilitiesUsed: [
+          'control',
+          'input',
+          'completion',
+          'trace',
+          'normalize',
+        ],
+        traceSource: 'host-local-transcript',
+        traceConfidence: 'medium',
+        artifacts: [],
+        session: { runMarker: 'MCP_SERVER_TESTER_TEST' },
+        correlation: TEST_CORRELATION,
+        sources: {
+          finalAnswer: 'host-local-transcript',
+          toolCalls: 'host-local-transcript',
+        },
+      },
+    });
+
+    const result = await runEvalCase(
+      {
+        id: 'external-host-medium-confidence',
+        mode: 'external_host',
+        scenario: 'Say hello and search.',
+        externalHost: {
+          driver: 'anthropic.claude.cowork.desktop-app.macos',
+        },
+        expect: {
+          toolsTriggered: {
+            calls: [{ name: 'search' }],
+          },
+        },
+      },
+      makeContext()
+    );
+
+    expect(result.pass).toBe(false);
+    expect(result.mcpHostTrace).toBeUndefined();
+    expect(result.expectations.toolsTriggered?.details).toContain(
+      'cannot support tool-call assertions'
+    );
+  });
+
+  it('counts external host driver failures as infrastructure failures across iterations', async () => {
+    const failure: ExternalHostRunResult = {
+      success: false,
+      error: 'Failed to submit prompt to Claude: automation permission denied',
+      toolCalls: [],
+      externalHost: {
+        driver: {
+          provider: 'anthropic',
+          product: 'claude',
+          surface: 'cowork',
+          runtime: 'desktop-app',
+          platform: 'macos',
+        },
+        driverSlug: 'anthropic.claude.cowork.desktop-app.macos',
+        displayName: 'Claude Cowork Desktop',
+        hostName: 'Claude Cowork Desktop',
+        hostType: 'desktop',
+        capabilitiesUsed: [],
+        traceSource: 'none',
+        traceConfidence: 'unknown',
+        artifacts: [],
+        session: { runMarker: 'MCP_SERVER_TESTER_TEST' },
+        correlation: TEST_CORRELATION,
+        failureKind: 'automation_permission_denied',
+      },
+    };
+    const deniedAgain: ExternalHostRunResult = {
+      ...failure,
+      error: 'Failed to submit prompt to Claude: still denied',
+    };
+    vi.mocked(runExternalHostScenario)
+      .mockResolvedValueOnce(failure)
+      .mockResolvedValueOnce(deniedAgain);
+
+    const result = await runEvalCase(
+      {
+        id: 'external-host-driver-failure',
+        mode: 'external_host',
+        scenario: 'Say hello.',
+        externalHost: {
+          driver: 'anthropic.claude.cowork.desktop-app.macos',
+        },
+        iterations: 2,
+        expect: {
+          containsText: 'hello',
+        },
+      },
+      makeContext()
+    );
+
+    expect(result.pass).toBe(false);
+    expect(result.infrastructureErrorCount).toBe(2);
+    expect(result.infrastructureErrorRate).toBe(1);
+    expect(result.iterationResults?.every((r) => r.isInfrastructureError)).toBe(
+      true
+    );
+  });
+});
diff --git a/src/evals/evalRunner.test.ts b/src/evals/evalRunner.test.ts
index f2f7200..82d9f66 100644
--- a/src/evals/evalRunner.test.ts
+++ b/src/evals/evalRunner.test.ts
@@ -608,7 +608,9 @@ describe('toolsTriggered and toolCallCount expectations in eval runner', () => {
 
     const result = await runEvalCase(evalCase, createContext(mcp));
     expect(result.expectations.toolsTriggered?.pass).toBe(false);
-    expect(result.expectations.toolsTriggered?.details).toContain('mcp_host');
+    expect(result.expectations.toolsTriggered?.details).toContain(
+      'host simulation response'
+    );
   });
 
   it('validates toolCallCount correctly from simulation result', async () => {
diff --git a/src/evals/evalRunner.ts b/src/evals/evalRunner.ts
index 5c943c5..235a712 100644
--- a/src/evals/evalRunner.ts
+++ b/src/evals/evalRunner.ts
@@ -5,6 +5,18 @@ import type { Tool } from '@modelcontextprotocol/sdk/types.js';
 import type { ZodType } from 'zod';
 import { simulateMCPHost } from './mcpHost/mcpHostSimulation.js';
 import type { MCPHostSimulationResult } from './mcpHost/mcpHostTypes.js';
+import { runExternalHostScenario } from './externalHost/runtime.js';
+import type {
+  ExternalHostCapabilitiesConfig,
+  ExternalHostCorrelationConfig,
+  ExternalHostMetadata,
+  ExternalHostSimulationResult,
+} from './externalHost/types.js';
+import {
+  driverToSlug,
+  normalizeHostDriver,
+} from './externalHost/driverIdentity.js';
+import { getRegisteredExternalHostConfig } from './externalHost/hostRegistry.js';
 import type { EvalExpectationResult, UsageMetrics } from '../types/index.js';
 import type {
   EvalCaseResult,
@@ -411,6 +423,33 @@ async function executeToolCall(
         throw new Error(simulationResult.error || 'MCP host simulation failed');
       }
 
+      return { response: simulationResult };
+    } else if (mode === 'external_host') {
+      if (!evalCase.scenario) {
+        throw new Error(
+          `Eval case ${evalCase.id}: scenario is required for external_host mode`
+        );
+      }
+
+      if (!evalCase.externalHost) {
+        throw new Error(
+          `Eval case ${evalCase.id}: externalHost is required for external_host mode`
+        );
+      }
+
+      const simulationResult = await runExternalHostScenario(
+        evalCase.scenario,
+        evalCase.externalHost,
+        { caseId: evalCase.id }
+      );
+
+      if (!simulationResult.success) {
+        return {
+          response: simulationResult,
+          error: simulationResult.error || 'External host simulation failed',
+        };
+      }
+
       return { response: simulationResult };
     } else {
       // Direct mode - call tool directly
@@ -670,11 +709,26 @@ function buildRequest(
   evalCase: EvalCase,
   toolOverrideVariantId?: string
 ): EvalCaseRequest {
-  const request: EvalCaseRequest = {};
+  const request: EvalCaseRequest = {
+    mode: evalCase.mode ?? 'direct',
+  };
   if (evalCase.description) request.description = evalCase.description;
   if (toolOverrideVariantId !== undefined) {
     request.toolOverrideVariantId = toolOverrideVariantId;
   }
+  if (evalCase.iterations !== undefined)
+    request.iterations = evalCase.iterations;
+  if (evalCase.accuracyThreshold !== undefined) {
+    request.accuracyThreshold = evalCase.accuracyThreshold;
+  }
+  if (evalCase.judgeReps !== undefined) request.judgeReps = evalCase.judgeReps;
+  if (evalCase.tags) request.tags = evalCase.tags;
+  if (evalCase.expect) {
+    request.expect = sanitizeReporterValue(evalCase.expect) as Record<
+      string,
+      unknown
+    >;
+  }
 
   if (evalCase.mode === 'mcp_host') {
     if (evalCase.scenario) request.scenario = evalCase.scenario;
@@ -686,6 +740,45 @@ function buildRequest(
         }),
       };
     }
+  } else if (evalCase.mode === 'external_host') {
+    if (evalCase.scenario) request.scenario = evalCase.scenario;
+    if (evalCase.externalHost) {
+      let driverSlug: string | undefined;
+      try {
+        driverSlug = driverToSlug(
+          normalizeHostDriver(evalCase.externalHost.driver)
+        );
+      } catch {
+        driverSlug = undefined;
+      }
+      const registeredConfig = driverSlug
+        ? getRegisteredExternalHostConfig(driverSlug)
+        : undefined;
+      const effectiveOptions = mergeReporterOptions(
+        registeredConfig?.options,
+        evalCase.externalHost.options
+      );
+      const effectiveCapabilities = mergeReporterCapabilities(
+        registeredConfig?.capabilities,
+        evalCase.externalHost.capabilities
+      );
+      const effectiveCorrelation = mergeReporterCorrelation(
+        registeredConfig?.correlation,
+        evalCase.externalHost.correlation
+      );
+      request.externalHost = {
+        driver: evalCase.externalHost.driver,
+        driverSlug,
+        name: evalCase.externalHost.name ?? registeredConfig?.name,
+        hostType: evalCase.externalHost.hostType,
+        variant: evalCase.externalHost.variant,
+        timeoutMs: evalCase.externalHost.timeoutMs,
+        usesBuiltInDefaults: registeredConfig !== undefined,
+        correlation: effectiveCorrelation,
+        options: sanitizeReporterRecord(effectiveOptions),
+        capabilities: serializeExternalHostCapabilities(effectiveCapabilities),
+      };
+    }
   } else {
     if (evalCase.args) request.args = evalCase.args;
   }
@@ -693,6 +786,130 @@ function buildRequest(
   return request;
 }
 
+/**
+ * Shallow-merges two reporter option bags.
+ *
+ * When both are present the result is a new object in which `override` keys
+ * win; when one is missing the other is returned as-is (not copied).
+ */
+function mergeReporterOptions(
+  base: Record<string, unknown> | undefined,
+  override: Record<string, unknown> | undefined
+): Record<string, unknown> | undefined {
+  if (base && override) {
+    return { ...base, ...override };
+  }
+  return base ? base : override;
+}
+
+/**
+ * Shallow-merges two capability configs. A capability declared by `override`
+ * replaces the same-named capability from `base` wholesale; if either config
+ * is missing, the other one is returned unchanged.
+ */
+function mergeReporterCapabilities(
+  base: ExternalHostCapabilitiesConfig | undefined,
+  override: ExternalHostCapabilitiesConfig | undefined
+): ExternalHostCapabilitiesConfig | undefined {
+  if (!base || !override) {
+    return base ? base : override;
+  }
+
+  return { ...base, ...override };
+}
+
+/**
+ * Resolves the effective correlation config from a base and an override.
+ *
+ * - no base: the override is returned (may be undefined)
+ * - no override: the base is returned
+ * - both: a new object with override fields spread over base fields
+ */
+function mergeReporterCorrelation(
+  base: ExternalHostCorrelationConfig | undefined,
+  override: ExternalHostCorrelationConfig | undefined
+): ExternalHostCorrelationConfig | undefined {
+  if (!base) return override;
+  if (!override) return base;
+  return { ...base, ...override };
+}
+
+/**
+ * Produces the reporter-facing copy of a capabilities config: each capability
+ * becomes a list of `{ uses, provides?, with? }` bindings whose `with` payload
+ * has gone through sanitizeReporterRecord. Falsy bindings are dropped.
+ */
+function serializeExternalHostCapabilities(
+  capabilities: ExternalHostCapabilitiesConfig | undefined
+): NonNullable<EvalCaseRequest['externalHost']>['capabilities'] {
+  if (!capabilities || typeof capabilities !== 'object') {
+    return undefined;
+  }
+
+  const result: NonNullable<
+    NonNullable<EvalCaseRequest['externalHost']>['capabilities']
+  > = {};
+
+  for (const [name, entry] of Object.entries(capabilities)) {
+    // An entry is either one binding or a list of them; normalize to a list.
+    const list = Array.isArray(entry) ? entry : [entry];
+    result[name] = list
+      .filter((item): item is NonNullable<typeof item> => Boolean(item))
+      .map(({ uses, provides, with: settings }) => ({
+        uses,
+        ...(provides !== undefined && { provides: [...provides] }),
+        ...(settings !== undefined && {
+          with: sanitizeReporterRecord(settings),
+        }),
+      }));
+  }
+
+  return result;
+}
+
+/** Reporter-safe copy of a record; a falsy input yields undefined. */
+function sanitizeReporterRecord(
+  value: Record<string, unknown> | undefined
+): Record<string, unknown> | undefined {
+  return value
+    ? (sanitizeReporterValue(value) as Record<string, unknown>)
+    : undefined;
+}
+
+/**
+ * Deep-copies `value` for reporters: secret-like keys become '[redacted]',
+ * functions '[function]', bigints strings; self-nesting becomes '[circular]'.
+ */
+function sanitizeReporterValue(
+  value: unknown,
+  ancestors: WeakSet<object> = new WeakSet<object>()
+): unknown {
+  if (typeof value === 'function') return '[function]';
+  if (typeof value === 'bigint') return value.toString();
+  if (value === null || typeof value !== 'object') return value;
+  if (ancestors.has(value)) return '[circular]';
+  ancestors.add(value); // path-scoped: shared non-cyclic references still copy
+  const visit = (item: unknown) => sanitizeReporterValue(item, ancestors);
+  let copy: unknown;
+  if (Array.isArray(value)) {
+    copy = value.map(visit);
+  } else {
+    const sanitized: Record<string, unknown> = {};
+    for (const [key, nested] of Object.entries(value)) {
+      sanitized[key] = isSecretLikeKey(key) ? '[redacted]' : visit(nested);
+    }
+    copy = sanitized;
+  }
+  ancestors.delete(value);
+  return copy;
+}
+
+/** Heuristic: does this key name look like it holds a credential? */
+function isSecretLikeKey(key: string): boolean {
+  const marker = /token|secret|password|credential|authorization|api[-_]?key/i;
+  return marker.test(key);
+}
+
 function isMCPHostSimulationResult(
   value: unknown
 ): value is MCPHostSimulationResult {
@@ -705,6 +922,12 @@ function isMCPHostSimulationResult(
   );
 }
 
+function isExternalHostSimulationResult(
+  value: unknown
+): value is ExternalHostSimulationResult {
+  return isMCPHostSimulationResult(value) && 'externalHost' in value;
+}
+
 /**
  * Runs a single iteration of an eval case (the atomic unit of work).
  * Extracted from runEvalCase to support multi-iteration accuracy loops.
@@ -718,6 +941,10 @@ async function runSingleIteration(
 
   // Execute tool call
   const { response, error } = await executeToolCall(evalCase, context.mcp);
+  const externalHost =
+    isExternalHostSimulationResult(response) && response.externalHost
+      ? response.externalHost
+      : undefined;
 
   // Collect expectation results from expect block
   let expectationResults: EvalCaseResult['expectations'] = {};
@@ -741,10 +968,24 @@ async function runSingleIteration(
     toolPrecision = tp;
     toolRecall = tr;
 
+    if (evalCase.mode === 'external_host' && externalHost) {
+      applyExternalHostEvidenceGating(
+        evalCase.expect,
+        externalHost,
+        expectationResults
+      );
+      if (expectationResults.toolsTriggered?.pass === false) {
+        toolPrecision = undefined;
+        toolRecall = undefined;
+      }
+    }
+
     // Build mcpHostTrace when toolsTriggered expectation is present
     if (
       evalCase.expect.toolsTriggered !== undefined &&
-      isMCPHostSimulationResult(response)
+      isMCPHostSimulationResult(response) &&
+      (evalCase.mode !== 'external_host' ||
+        (externalHost !== undefined && hasStructuredToolEvidence(externalHost)))
     ) {
       const expectedNames = new Set(
         evalCase.expect.toolsTriggered.calls.map((c) => c.name)
@@ -780,7 +1021,11 @@ async function runSingleIteration(
     id: evalCase.id,
     datasetName: options.datasetName ?? 'single-case',
     toolName:
-      evalCase.scenario != null ? 'mcp_host' : (evalCase.toolName ?? 'unknown'),
+      evalCase.mode === 'external_host'
+        ? 'external_host'
+        : evalCase.scenario != null
+          ? 'mcp_host'
+          : (evalCase.toolName ?? 'unknown'),
     source: 'eval',
     pass: didCasePass(error, expectationResults),
     request: buildRequest(evalCase, options.toolOverrideVariantId),
@@ -795,9 +1040,60 @@ async function runSingleIteration(
     toolRecall,
     mcpHostTrace,
     hostUsage,
+    externalHost,
   };
 }
 
+/**
+ * Fails toolsTriggered/toolCallCount expectations, in place, when the external
+ * host did not provide structured tool-call evidence. No-op otherwise.
+ */
+function applyExternalHostEvidenceGating(
+  expectBlock: EvalExpectBlock,
+  externalHost: ExternalHostMetadata,
+  expectationResults: EvalCaseResult['expectations']
+): void {
+  const wantsTriggered = expectBlock.toolsTriggered !== undefined;
+  const wantsCallCount = expectBlock.toolCallCount !== undefined;
+  if (!(wantsTriggered || wantsCallCount)) return;
+  if (hasStructuredToolEvidence(externalHost)) return;
+
+  const source = externalHost.sources?.toolCalls ?? externalHost.traceSource;
+  const details = `External host trace source ${source} (${externalHost.traceConfidence} confidence) cannot support tool-call assertions. Use protocol traces or host-native structured traces for toolsTriggered/toolCallCount.`;
+
+  if (wantsTriggered) {
+    expectationResults.toolsTriggered = { pass: false, details };
+  }
+  if (wantsCallCount) {
+    expectationResults.toolCallCount = { pass: false, details };
+  }
+}
+
+/**
+ * Reports whether tool-call data for an external host run came from a
+ * structured source with high confidence. Per-field evidence, when present,
+ * is authoritative; otherwise the run-level trace source/confidence is used.
+ */
+function hasStructuredToolEvidence(
+  externalHost: ExternalHostMetadata
+): boolean {
+  const isStructured = (source: string): boolean =>
+    source === 'mcp-proxy' ||
+    source === 'mcp-server-logs' ||
+    source === 'host-local-transcript' ||
+    source === 'host-native-export';
+
+  const toolCallEvidence = externalHost.evidence?.toolCalls;
+  if (toolCallEvidence) {
+    const { confidence, source } = toolCallEvidence;
+    return confidence === 'high' && isStructured(source);
+  }
+
+  const runSource =
+    externalHost.sources?.toolCalls ?? externalHost.traceSource;
+  return externalHost.traceConfidence === 'high' && isStructured(runSource);
+}
+
 /**
  * Returns true when the error message appears to be caused by network or
  * infrastructure issues (connection resets, timeouts, rate limits, etc.)
@@ -830,6 +1126,12 @@ function isInfrastructureError(err: unknown): boolean {
     msg.includes('429') ||
     msg.includes('503') ||
     msg.includes('network') ||
+    msg.includes('automation permission') ||
+    msg.includes('automation/accessibility') ||
+    msg.includes('no matching claude session') ||
+    msg.includes('timed out waiting for claude session') ||
+    msg.includes('failed to submit prompt to claude') ||
+    msg.includes('failed to submit prompt to desktop host') ||
     // Prompt/context overflow — LLM couldn't run, not a tool discoverability failure
     msg.includes('prompt is too long') ||
     msg.includes('context length exceeded') ||
@@ -842,6 +1144,12 @@ function isInfrastructureError(err: unknown): boolean {
   );
 }
 
+function isExternalHostInfrastructureFailure(
+  externalHost: ExternalHostMetadata | undefined
+): boolean {
+  return externalHost?.failureKind !== undefined;
+}
+
 /**
  * Runs a single eval case and returns the result.
  * When `evalCase.iterations > 1`, runs the case N times and returns accuracy.
@@ -884,7 +1192,8 @@ export async function runEvalCase(
       // Check whether the tool call itself failed due to infrastructure (the
       // error is surfaced as result.error since executeToolCall swallows throws)
       const infraError =
-        result.error != null && isInfrastructureError(result.error);
+        isExternalHostInfrastructureFailure(result.externalHost) ||
+        (result.error != null && isInfrastructureError(result.error));
       iterationResults.push({
         pass: result.pass,
         durationMs: result.durationMs,
@@ -892,6 +1201,7 @@ export async function runEvalCase(
         isInfrastructureError: infraError,
         mcpHostTrace: result.mcpHostTrace,
         hostUsage: result.hostUsage,
+        externalHost: result.externalHost,
       });
     } catch (err) {
       // runSingleIteration should not throw, but guard defensively
@@ -920,7 +1230,11 @@ export async function runEvalCase(
     id: evalCase.id,
     datasetName: options.datasetName ?? 'single-case',
     toolName:
-      evalCase.scenario != null ? 'mcp_host' : (evalCase.toolName ?? 'unknown'),
+      evalCase.mode === 'external_host'
+        ? 'external_host'
+        : evalCase.scenario != null
+          ? 'mcp_host'
+          : (evalCase.toolName ?? 'unknown'),
     source: 'eval',
     pass: false,
     error: iterationResults[0]?.error,
@@ -1080,7 +1394,7 @@ export async function runEvalDataset(
   // Preflight cost warning: estimate the number of LLM judge API calls this run will make
   const estimatedJudgeCalls = casesToRun.reduce((sum, c) => {
     const effectiveIterations =
-      c.mode === 'mcp_host'
+      c.mode === 'mcp_host' || c.mode === 'external_host'
         ? (c.iterations ?? defaultLlmIterations ?? 1)
         : (c.iterations ?? 1);
     if (c.expect?.passesJudge == null) return sum;
@@ -1102,10 +1416,10 @@ export async function runEvalDataset(
 
   // Build task factories for all cases
   const tasks = casesToRun.map((evalCase) => async () => {
-    // Apply defaultLlmIterations to mcp_host cases that don't specify iterations.
+    // Apply defaultLlmIterations to host-driven cases that don't specify iterations.
     // Direct mode cases are deterministic — they always stay at 1 iteration.
     const withIterations =
-      evalCase.mode === 'mcp_host' &&
+      (evalCase.mode === 'mcp_host' || evalCase.mode === 'external_host') &&
       evalCase.iterations === undefined &&
       defaultLlmIterations !== undefined
         ? { ...evalCase, iterations: defaultLlmIterations }
@@ -1116,11 +1430,11 @@ export async function runEvalDataset(
     // Single-iteration mcp_host runs (the default) are a valid smoke-test pattern
     // and are not warned about — the warning is scoped to cases that have
     // explicitly chosen a multi-iteration count that is too small to be reliable.
-    if (evalCase.mode === 'mcp_host') {
+    if (evalCase.mode === 'mcp_host' || evalCase.mode === 'external_host') {
       const effectiveIterations = withIterations.iterations ?? 1;
       if (effectiveIterations > 1 && effectiveIterations < 10) {
         console.warn(
-          `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in mcp_host mode ` +
+          `[mcp-server-tester] Eval case "${evalCase.id}": running ${effectiveIterations} iterations in ${evalCase.mode} mode ` +
             `may not be statistically reliable. Consider using 10+ iterations for accuracy measurements you can trust.`
         );
       }
diff --git a/src/evals/externalHost/builtinCapabilities.ts b/src/evals/externalHost/builtinCapabilities.ts
new file mode 100644
index 0000000..ec3e55c
--- /dev/null
+++ b/src/evals/externalHost/builtinCapabilities.ts
@@ -0,0 +1,22 @@
+import { ANTHROPIC_CLAUDE_CAPABILITIES } from './builtins/anthropicClaude.js';
+import { MACOS_DESKTOP_CAPABILITIES } from './builtins/macosDesktop.js';
+import type { ExternalHostCapabilityImplementation } from './types.js';
+
+const BUILTIN_CAPABILITIES = new Map<
+  string,
+  ExternalHostCapabilityImplementation
+>(
+  [...MACOS_DESKTOP_CAPABILITIES, ...ANTHROPIC_CLAUDE_CAPABILITIES].map(
+    (implementation) => [implementation.id, implementation]
+  )
+);
+
+export function listBuiltinExternalHostCapabilities(): ExternalHostCapabilityImplementation[] {
+  return Array.from(BUILTIN_CAPABILITIES.values());
+}
+
+export function resolveBuiltinExternalHostCapability(
+  uses: string
+): ExternalHostCapabilityImplementation | undefined {
+  return BUILTIN_CAPABILITIES.get(uses);
+}
diff --git a/src/evals/externalHost/builtins/anthropicClaude.integration.test.ts b/src/evals/externalHost/builtins/anthropicClaude.integration.test.ts
new file mode 100644
index 0000000..4687173
--- /dev/null
+++ b/src/evals/externalHost/builtins/anthropicClaude.integration.test.ts
@@ -0,0 +1,70 @@
+import { describe, expect, it } from 'vitest';
+import { runExternalHostScenario } from '../runtime.js';
+
+describe('Claude external host integrations', () => {
+  it('drives Claude Chat Desktop and captures low-confidence visible response evidence', async () => {
+    const result = await runExternalHostScenario(
+      'Please reply with exactly: external host integration acknowledged.',
+      {
+        driver: 'anthropic.claude.chat.desktop-app.macos',
+        name: 'Claude Chat Desktop',
+        timeoutMs: 30_000,
+      },
+      { caseId: 'claude-chat-desktop-integration' }
+    );
+    // Throw with failureKind and error text so a failed run is diagnosable.
+    if (!result.success) {
+      throw new Error(
+        `${result.externalHost.failureKind ?? 'unknown'}: ${result.error}`
+      );
+    }
+    // Chat is expected to report accessibility-sourced, low-confidence traces.
+    expect(result.response?.toLowerCase()).toContain(
+      'external host integration acknowledged'
+    );
+    expect(result.externalHost.driverSlug).toBe(
+      'anthropic.claude.chat.desktop-app.macos'
+    );
+    expect(result.externalHost.traceSource).toBe('accessibility');
+    expect(result.externalHost.traceConfidence).toBe('low');
+    expect(result.externalHost.artifacts.length).toBeGreaterThan(0);
+    expect(result.externalHost.session.runMarker).toContain(
+      'MCP_SERVER_TESTER_'
+    );
+  }, 150_000);
+
+  it('drives the active Claude Cowork Desktop surface and captures high-confidence local-agent trace evidence', async () => {
+    const result = await runExternalHostScenario(
+      'Please reply with exactly: external host integration acknowledged.',
+      {
+        driver: 'anthropic.claude.cowork.desktop-app.macos',
+        name: 'Claude Cowork Desktop',
+        timeoutMs: 60_000,
+        options: {
+          newConversationShortcut: 'none',
+        },
+      },
+      { caseId: 'claude-cowork-desktop-integration' }
+    );
+    // Throw with failureKind and error text so a failed run is diagnosable.
+    if (!result.success) {
+      throw new Error(
+        `${result.externalHost.failureKind ?? 'unknown'}: ${result.error}`
+      );
+    }
+    // Cowork is expected to report a high-confidence host-local transcript.
+    expect(result.response?.toLowerCase()).toContain(
+      'external host integration acknowledged'
+    );
+    expect(result.externalHost.driverSlug).toBe(
+      'anthropic.claude.cowork.desktop-app.macos'
+    );
+    expect(result.externalHost.traceSource).toBe('host-local-transcript');
+    expect(result.externalHost.traceConfidence).toBe('high');
+    expect(result.externalHost.artifacts.length).toBeGreaterThan(0);
+    expect(result.externalHost.session.id).toBeDefined();
+    expect(result.externalHost.session.runMarker).toContain(
+      'MCP_SERVER_TESTER_'
+    );
+  }, 150_000);
+});
diff --git a/src/evals/externalHost/builtins/anthropicClaude.test.ts b/src/evals/externalHost/builtins/anthropicClaude.test.ts
new file mode 100644
index 0000000..1a9b5d2
--- /dev/null
+++ b/src/evals/externalHost/builtins/anthropicClaude.test.ts
@@ -0,0 +1,835 @@
+import { mkdtemp, mkdir, writeFile } from 'node:fs/promises';
+import { tmpdir } from 'node:os';
+import { join } from 'node:path';
+import { describe, expect, it } from 'vitest';
+import {
+  buildClaudeTraceMetadata,
+  findMatchingClaudeSessions,
+  extractAccessibilityResponse,
+  getClaudeDataDir,
+  looksLikeClaudeChatSurface,
+  parseClaudeTrace,
+  snapshotClaudeSessions,
+  waitForClaudeTrace,
+  type SessionCandidate,
+} from './anthropicClaude.js';
+
+const COWORK_DRIVER = {
+  provider: 'anthropic',
+  product: 'claude',
+  surface: 'cowork',
+  runtime: 'desktop-app',
+  platform: 'macos',
+} as const;
+
+/**
+ * Test helper: writes `events` to `path` as newline-separated JSON documents.
+ */
+async function writeJsonl(path: string, events: unknown[]): Promise<void> {
+  const lines = events.map((event) => JSON.stringify(event));
+  await writeFile(path, lines.join('\n'), 'utf-8');
+}
+
+describe('anthropicClaude trace parsing', () => {
+  it('parses final answer, usage, tool calls, and artifacts from local Claude files', async () => {
+    const root = await mkdtemp(join(tmpdir(), 'claude-trace-'));
+    const sessionId = 'local_test';
+    const cliSessionId = 'cli-session';
+    const sessionDir = join(root, sessionId);
+    const transcriptDir = join(
+      sessionDir,
+      '.claude',
+      'projects',
+      '-sessions-test'
+    );
+    await mkdir(transcriptDir, { recursive: true });
+
+    const metadataPath = join(root, `${sessionId}.json`);
+    await writeFile(
+      metadataPath,
+      JSON.stringify({
+        sessionId,
+        cliSessionId,
+        initialMessage: 'marker MCP_SERVER_TESTER_TEST',
+        cwd: '/sessions/test',
+        createdAt: '2026-05-09T00:00:00.000Z',
+      }),
+      'utf-8'
+    );
+
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      {
+        type: 'result',
+        result: 'trace spike acknowledged.',
+        requestId: 'req_123',
+        duration_ms: 1234,
+        duration_api_ms: 1000,
+        total_cost_usd: 0.01,
+        usage: {
+          input_tokens: 10,
+          output_tokens: 5,
+          cache_read_input_tokens: 2,
+        },
+        timestamp: '2026-05-09T00:00:02.000Z',
+      },
+    ]);
+
+    await writeJsonl(join(transcriptDir, `${cliSessionId}.jsonl`), [
+      {
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'tool_use',
+              id: 'toolu_1',
+              name: 'mcp__server__search',
+              input: { query: 'planning' },
+            },
+          ],
+        },
+      },
+    ]);
+
+    const candidate: SessionCandidate = {
+      id: sessionId,
+      metadataPath,
+      sessionDir,
+      statMtimeMs: Date.now(),
+      metadata: {
+        sessionId,
+        cliSessionId,
+        initialMessage: 'marker MCP_SERVER_TESTER_TEST',
+        cwd: '/sessions/test',
+      },
+    };
+
+    const trace = await parseClaudeTrace(candidate);
+
+    expect(trace.finalAnswer).toBe('trace spike acknowledged.');
+    expect(trace.requestId).toBe('req_123');
+    expect(trace.usage).toMatchObject({
+      inputTokens: 10,
+      outputTokens: 5,
+      totalCostUsd: 0.01,
+      durationMs: 1234,
+      durationApiMs: 1000,
+      cacheReadInputTokens: 2,
+    });
+    expect(trace.toolCalls).toEqual([
+      {
+        id: 'toolu_1',
+        name: 'search',
+        arguments: { query: 'planning' },
+      },
+    ]);
+    expect(trace.transcriptPath).toContain(`${cliSessionId}.jsonl`);
+    expect(trace.isComplete).toBe(true);
+    expect(trace.auditParsed).toBe(true);
+    expect(trace.transcriptParsed).toBe(true);
+    expect(trace.usageAvailable).toBe(true);
+    expect(trace.costAvailable).toBe(true);
+  });
+
+  it('does not treat assistant text without a result event as a completed run', async () => {
+    const root = await mkdtemp(join(tmpdir(), 'claude-pending-'));
+    const sessionId = 'local_pending';
+    const sessionDir = join(root, sessionId);
+    await mkdir(sessionDir, { recursive: true });
+    const metadataPath = join(root, `${sessionId}.json`);
+    await writeFile(
+      metadataPath,
+      JSON.stringify({
+        sessionId,
+        initialMessage: 'marker MCP_SERVER_TESTER_PENDING',
+        createdAt: new Date().toISOString(),
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      {
+        type: 'assistant',
+        message: {
+          content: [{ type: 'text', text: 'partial assistant response' }],
+        },
+      },
+    ]);
+
+    const candidate: SessionCandidate = {
+      id: sessionId,
+      metadataPath,
+      sessionDir,
+      statMtimeMs: Date.now(),
+      metadata: {
+        sessionId,
+        initialMessage: 'marker MCP_SERVER_TESTER_PENDING',
+      },
+    };
+
+    const trace = await parseClaudeTrace(candidate);
+
+    expect(trace.finalAnswer).toBe('partial assistant response');
+    expect(trace.isComplete).toBe(false);
+  });
+
+  it('continues parsing valid JSONL events when one line is malformed', async () => {
+    const root = await mkdtemp(join(tmpdir(), 'claude-jsonl-'));
+    const sessionId = 'local_jsonl';
+    const sessionDir = join(root, sessionId);
+    await mkdir(sessionDir, { recursive: true });
+    const metadataPath = join(root, `${sessionId}.json`);
+    await writeFile(
+      metadataPath,
+      JSON.stringify({
+        sessionId,
+        initialMessage: 'marker MCP_SERVER_TESTER_JSONL',
+      }),
+      'utf-8'
+    );
+    await writeFile(
+      join(sessionDir, 'audit.jsonl'),
+      [
+        JSON.stringify({ type: 'assistant', result: 'ignored' }),
+        '{not-json',
+        JSON.stringify({ type: 'result', result: 'final answer' }),
+      ].join('\n'),
+      'utf-8'
+    );
+
+    const candidate: SessionCandidate = {
+      id: sessionId,
+      metadataPath,
+      sessionDir,
+      statMtimeMs: Date.now(),
+      metadata: {
+        sessionId,
+        initialMessage: 'marker MCP_SERVER_TESTER_JSONL',
+      },
+    };
+
+    const trace = await parseClaudeTrace(candidate);
+
+    expect(trace.finalAnswer).toBe('final answer');
+    expect(trace.isComplete).toBe(true);
+    expect(trace.parseWarnings.join('\n')).toContain(
+      'discarded 1 malformed JSONL line'
+    );
+  });
+
+  it('only marks evidence fields high confidence when the parsed trace supports them', async () => {
+    const root = await mkdtemp(join(tmpdir(), 'claude-evidence-'));
+    const sessionId = 'local_evidence';
+    const sessionDir = join(root, sessionId);
+    await mkdir(sessionDir, { recursive: true });
+    const metadataPath = join(root, `${sessionId}.json`);
+    await writeFile(
+      metadataPath,
+      JSON.stringify({
+        sessionId,
+        initialMessage: 'marker MCP_SERVER_TESTER_EVIDENCE',
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      {
+        type: 'result',
+        result: 'final answer',
+        total_cost_usd: 0.01,
+        usage: { input_tokens: 1, output_tokens: 2 },
+      },
+    ]);
+
+    const trace = await parseClaudeTrace({
+      id: sessionId,
+      metadataPath,
+      sessionDir,
+      statMtimeMs: Date.now(),
+      metadata: {
+        sessionId,
+        initialMessage: 'marker MCP_SERVER_TESTER_EVIDENCE',
+      },
+    });
+    const metadata = buildClaudeTraceMetadata({
+      config: {
+        driver: COWORK_DRIVER,
+        name: 'Claude Cowork Desktop',
+      },
+      context: {
+        runId: 'run',
+        caseId: 'case',
+        scenario: 'scenario',
+        submittedScenario: 'scenario',
+        marker: 'MCP_SERVER_TESTER_EVIDENCE',
+        correlation: {
+          strategy: 'prompt_marker',
+          marker: 'MCP_SERVER_TESTER_EVIDENCE',
+          includedInPrompt: true,
+        },
+        timeoutMs: 1000,
+        startedAtMs: Date.now(),
+      },
+      driver: COWORK_DRIVER,
+      displayName: 'Claude Cowork Desktop',
+      artifacts: [],
+      trace,
+      limitations: [],
+    });
+
+    expect(metadata.evidence?.finalAnswer).toEqual({
+      source: 'host-local-transcript',
+      confidence: 'high',
+    });
+    expect(metadata.evidence?.toolCalls).toEqual({
+      source: 'none',
+      confidence: 'unknown',
+    });
+    expect(metadata.evidence?.usage).toEqual({
+      source: 'host-local-transcript',
+      confidence: 'high',
+    });
+    expect(metadata.evidence?.cost).toEqual({
+      source: 'host-local-transcript',
+      confidence: 'high',
+    });
+    expect(metadata.traceConfidence).toBe('high');
+    expect(metadata.traceLimitations?.join('\n')).toContain(
+      'Tool-call evidence is unavailable'
+    );
+  });
+
+  it('allows capability-local Claude data directory options to override driver-wide options', () => {
+    expect(
+      getClaudeDataDir(
+        {
+          driver: COWORK_DRIVER,
+          options: { dataDir: '/global/claude' },
+        },
+        { with: { dataDir: '/capability/claude' } }
+      )
+    ).toBe('/capability/claude');
+  });
+
+  it('matches sessions by marker instead of timing alone', async () => {
+    const root = await mkdtemp(join(tmpdir(), 'claude-match-'));
+    const sessionDir = join(root, 'local_match');
+    await mkdir(sessionDir, { recursive: true });
+    const metadataPath = join(root, 'local_match.json');
+
+    await writeFile(
+      metadataPath,
+      JSON.stringify({
+        sessionId: 'local_match',
+        initialMessage: 'hello MCP_SERVER_TESTER_MATCH',
+        createdAt: new Date().toISOString(),
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      { type: 'result', result: 'done' },
+    ]);
+
+    const matches = await findMatchingClaudeSessions({
+      dataDir: root,
+      marker: 'MCP_SERVER_TESTER_MATCH',
+      snapshot: new Map(),
+      startedAtMs: Date.now() - 1000,
+    });
+
+    expect(matches).toHaveLength(1);
+    expect(matches[0]?.finalAnswer).toBe('done');
+  });
+
+  it('handles numeric Claude metadata timestamps when checking recency', async () => {
+    const root = await mkdtemp(join(tmpdir(), 'claude-numeric-time-'));
+    const sessionDir = join(root, 'local_numeric_time');
+    await mkdir(sessionDir, { recursive: true });
+    await writeFile(
+      join(root, 'local_numeric_time.json'),
+      JSON.stringify({
+        sessionId: 'local_numeric_time',
+        initialMessage: 'hello MCP_SERVER_TESTER_NUMERIC_TIME',
+        createdAt: Date.now(),
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      { type: 'result', result: 'numeric timestamp done' },
+    ]);
+
+    const snapshot = await snapshotClaudeSessions(root);
+    const matches = await findMatchingClaudeSessions({
+      dataDir: root,
+      marker: 'MCP_SERVER_TESTER_NUMERIC_TIME',
+      snapshot,
+      startedAtMs: Date.now() - 1000,
+    });
+
+    expect(matches).toHaveLength(1);
+    expect(matches[0]?.finalAnswer).toBe('numeric timestamp done');
+  });
+
+  it('snapshots existing sessions so old unchanged files are ignored', async () => { // a session captured in the snapshot and untouched afterwards must not match
+    const root = await mkdtemp(join(tmpdir(), 'claude-snapshot-'));
+    await mkdir(join(root, 'local_old'), { recursive: true });
+    await writeFile(
+      join(root, 'local_old.json'),
+      JSON.stringify({
+        sessionId: 'local_old',
+        initialMessage: 'MCP_SERVER_TESTER_OLD', // marker is present, but the metadata has no createdAt
+      }),
+      'utf-8'
+    );
+
+    const snapshot = await snapshotClaudeSessions(root); // records the pre-existing session
+    const matches = await findMatchingClaudeSessions({
+      dataDir: root,
+      marker: 'MCP_SERVER_TESTER_OLD',
+      snapshot,
+      startedAtMs: Date.now(),
+    });
+
+    expect(matches).toEqual([]); // a marker hit alone is not enough for an unchanged session
+  });
+
+  it('detects reused sessions when audit files change after the snapshot', async () => { // a snapshotted session must match again once its audit log is written later
+    const root = await mkdtemp(join(tmpdir(), 'claude-reuse-'));
+    const sessionDir = join(root, 'local_reuse');
+    await mkdir(sessionDir, { recursive: true });
+    await writeFile(
+      join(root, 'local_reuse.json'),
+      JSON.stringify({
+        sessionId: 'local_reuse',
+        initialMessage: 'MCP_SERVER_TESTER_REUSE',
+      }),
+      'utf-8'
+    );
+
+    const snapshot = await snapshotClaudeSessions(root); // snapshot taken before any audit.jsonl exists
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [ // audit log appears only after the snapshot
+      { type: 'result', result: 'reuse done' },
+    ]);
+
+    const matches = await findMatchingClaudeSessions({
+      dataDir: root,
+      marker: 'MCP_SERVER_TESTER_REUSE',
+      snapshot,
+      startedAtMs: Date.now(),
+    });
+
+    expect(matches).toHaveLength(1);
+    expect(matches[0]?.finalAnswer).toBe('reuse done');
+  });
+
+  it('does not use a pre-marker result as completion for a reused session', async () => { // a result event logged before the marker belongs to an earlier run
+    const root = await mkdtemp(join(tmpdir(), 'claude-reuse-marker-'));
+    const sessionId = 'local_reuse_marker';
+    const sessionDir = join(root, sessionId);
+    await mkdir(sessionDir, { recursive: true });
+    const metadataPath = join(root, `${sessionId}.json`);
+    await writeFile(
+      metadataPath,
+      JSON.stringify({
+        sessionId,
+        initialMessage: 'old run', // metadata does not contain the marker
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      { type: 'result', result: 'old completed answer' }, // terminal event of the earlier run, ordered before the marker
+      {
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'text',
+              text: 'MCP_SERVER_TESTER_REUSED_MARKER partial new response', // first appearance of the marker; no result event follows it
+            },
+          ],
+        },
+      },
+    ]);
+
+    const trace = await parseClaudeTrace( // parse the hand-built candidate directly, bypassing session discovery
+      {
+        id: sessionId,
+        metadataPath,
+        sessionDir,
+        statMtimeMs: Date.now(),
+        metadata: {
+          sessionId,
+          initialMessage: 'old run',
+        },
+      },
+      'MCP_SERVER_TESTER_REUSED_MARKER'
+    );
+
+    expect(trace.finalAnswer).toBe(
+      'MCP_SERVER_TESTER_REUSED_MARKER partial new response'
+    );
+    expect(trace.isComplete).toBe(false); // the old result must not mark the new run as complete
+  });
+
+  it('does not use a pre-marker result when metadata contains the marker but the audit is still pending', async () => { // same scenario as a reused session, except the marker is also in the metadata
+    const root = await mkdtemp(join(tmpdir(), 'claude-metadata-marker-'));
+    const sessionId = 'local_metadata_marker';
+    const sessionDir = join(root, sessionId);
+    await mkdir(sessionDir, { recursive: true });
+    const metadataPath = join(root, `${sessionId}.json`);
+    await writeFile(
+      metadataPath,
+      JSON.stringify({
+        sessionId,
+        initialMessage: 'MCP_SERVER_TESTER_METADATA_MARKER prompt', // marker present in the metadata this time
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      { type: 'result', result: 'old completed answer' }, // result event that precedes the marker in the audit log
+      {
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'text',
+              text: 'MCP_SERVER_TESTER_METADATA_MARKER partial new response', // marker text with no result event after it
+            },
+          ],
+        },
+      },
+    ]);
+
+    const trace = await parseClaudeTrace(
+      {
+        id: sessionId,
+        metadataPath,
+        sessionDir,
+        statMtimeMs: Date.now(),
+        metadata: {
+          sessionId,
+          initialMessage: 'MCP_SERVER_TESTER_METADATA_MARKER prompt',
+        },
+      },
+      'MCP_SERVER_TESTER_METADATA_MARKER'
+    );
+
+    expect(trace.finalAnswer).toBe(
+      'MCP_SERVER_TESTER_METADATA_MARKER partial new response'
+    );
+    expect(trace.isComplete).toBe(false); // a marker in metadata must not make the earlier result count as completion
+  });
+
+  it('does not combine a transcript marker with a pre-marker audit result', async () => { // marker seen only in the transcript must not be paired with an old audit result
+    const root = await mkdtemp(join(tmpdir(), 'claude-cross-source-marker-'));
+    const sessionId = 'local_cross_source_marker';
+    const cliSessionId = 'cli-cross-source';
+    const sessionDir = join(root, sessionId);
+    const transcriptDir = join(sessionDir, '.claude', 'projects', '-project'); // transcript location used by these fixtures
+    await mkdir(transcriptDir, { recursive: true });
+    const metadataPath = join(root, `${sessionId}.json`);
+    await writeFile(
+      metadataPath,
+      JSON.stringify({
+        sessionId,
+        cliSessionId,
+        initialMessage: 'old run',
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      { type: 'result', result: 'old completed answer' }, // audit log holds only the old result and no marker
+    ]);
+    await writeJsonl(join(transcriptDir, `${cliSessionId}.jsonl`), [ // transcript file is named after cliSessionId
+      {
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'text',
+              text: 'MCP_SERVER_TESTER_CROSS_SOURCE partial new response', // marker appears only here, in the transcript
+            },
+          ],
+        },
+      },
+    ]);
+
+    const trace = await parseClaudeTrace(
+      {
+        id: sessionId,
+        metadataPath,
+        sessionDir,
+        statMtimeMs: Date.now(),
+        metadata: {
+          sessionId,
+          cliSessionId,
+          initialMessage: 'old run',
+        },
+      },
+      'MCP_SERVER_TESTER_CROSS_SOURCE'
+    );
+
+    expect(trace.finalAnswer).toBe(
+      'MCP_SERVER_TESTER_CROSS_SOURCE partial new response'
+    );
+    expect(trace.isComplete).toBe(false);
+  });
+
+  it('normalizes MCP tool names when server names contain underscores', async () => { // 'mcp__my_server__search' must be reported as 'search'
+    const root = await mkdtemp(join(tmpdir(), 'claude-tool-name-'));
+    const sessionId = 'local_tool_name';
+    const cliSessionId = 'cli-session';
+    const sessionDir = join(root, sessionId);
+    const transcriptDir = join(sessionDir, '.claude', 'projects', '-project');
+    await mkdir(transcriptDir, { recursive: true });
+    const metadataPath = join(root, `${sessionId}.json`);
+    await writeFile(
+      metadataPath,
+      JSON.stringify({
+        sessionId,
+        cliSessionId,
+        initialMessage: 'marker MCP_SERVER_TESTER_TOOL',
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      { type: 'result', result: 'done' },
+    ]);
+    await writeJsonl(join(transcriptDir, `${cliSessionId}.jsonl`), [
+      {
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'tool_use',
+              id: 'toolu_1',
+              name: 'mcp__my_server__search', // server segment 'my_server' itself contains an underscore
+              input: { query: 'planning' },
+            },
+          ],
+        },
+      },
+    ]);
+
+    const trace = await parseClaudeTrace({ // called without a marker argument
+      id: sessionId,
+      metadataPath,
+      sessionDir,
+      statMtimeMs: Date.now(),
+      metadata: {
+        sessionId,
+        cliSessionId,
+        initialMessage: 'marker MCP_SERVER_TESTER_TOOL',
+      },
+    });
+
+    expect(trace.toolCalls[0]?.name).toBe('search'); // only the bare tool name remains
+  });
+
+  it('waits for a terminal result event before returning a matched trace', async () => { // the waiter must keep polling while the audit log has no result event
+    const root = await mkdtemp(join(tmpdir(), 'claude-wait-result-'));
+    const sessionDir = join(root, 'local_wait');
+    await mkdir(sessionDir, { recursive: true });
+    await writeFile(
+      join(root, 'local_wait.json'),
+      JSON.stringify({
+        sessionId: 'local_wait',
+        initialMessage: 'MCP_SERVER_TESTER_WAIT',
+        createdAt: new Date().toISOString(),
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [ // initial audit log: assistant text only, no result event
+      {
+        type: 'assistant',
+        message: { content: [{ type: 'text', text: 'partial' }] },
+      },
+    ]);
+
+    const tracePromise = waitForClaudeTrace({ // started but not awaited, so the test can update the audit log while it polls
+      dataDir: root,
+      marker: 'MCP_SERVER_TESTER_WAIT',
+      correlation: {
+        strategy: 'prompt_marker',
+        marker: 'MCP_SERVER_TESTER_WAIT',
+        includedInPrompt: true,
+      },
+      snapshot: new Map(), // empty snapshot: every discovered session counts as new
+      timeoutMs: 2_500,
+      startedAtMs: Date.now() - 1000,
+    });
+
+    await new Promise((resolve) => setTimeout(resolve, 900)); // real-timer wait, long enough for at least one poll before the rewrite
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [ // audit.jsonl written again, now with a terminal result event
+      {
+        type: 'assistant',
+        message: { content: [{ type: 'text', text: 'partial' }] },
+      },
+      { type: 'result', result: 'complete' },
+    ]);
+
+    await expect(tracePromise).resolves.toMatchObject({
+      finalAnswer: 'complete',
+      isComplete: true,
+    });
+  });
+
+  it('waits briefly for an expected embedded transcript after the result event', async () => { // with a cliSessionId set, a complete audit alone is not returned immediately
+    const root = await mkdtemp(join(tmpdir(), 'claude-wait-transcript-'));
+    const sessionId = 'local_wait_transcript';
+    const cliSessionId = 'cli-session';
+    const sessionDir = join(root, sessionId);
+    const transcriptDir = join(sessionDir, '.claude', 'projects', '-project');
+    await mkdir(transcriptDir, { recursive: true });
+    await writeFile(
+      join(root, `${sessionId}.json`),
+      JSON.stringify({
+        sessionId,
+        cliSessionId, // presence of cliSessionId is what makes the waiter expect a transcript
+        initialMessage: 'MCP_SERVER_TESTER_TRANSCRIPT',
+        createdAt: new Date().toISOString(),
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      { type: 'result', result: 'complete' }, // audit log is already terminal before waiting starts
+    ]);
+
+    const tracePromise = waitForClaudeTrace({
+      dataDir: root,
+      marker: 'MCP_SERVER_TESTER_TRANSCRIPT',
+      correlation: {
+        strategy: 'prompt_marker',
+        marker: 'MCP_SERVER_TESTER_TRANSCRIPT',
+        includedInPrompt: true,
+      },
+      snapshot: new Map(),
+      timeoutMs: 3_500,
+      startedAtMs: Date.now() - 1000,
+    });
+
+    await new Promise((resolve) => setTimeout(resolve, 900)); // transcript is written 900ms in, before the 1_500ms settle window ends
+    await writeJsonl(join(transcriptDir, `${cliSessionId}.jsonl`), [
+      {
+        type: 'assistant',
+        message: {
+          content: [
+            {
+              type: 'tool_use',
+              id: 'toolu_1',
+              name: 'mcp__server__search',
+              input: { query: 'planning' },
+            },
+          ],
+        },
+      },
+    ]);
+
+    await expect(tracePromise).resolves.toMatchObject({
+      finalAnswer: 'complete',
+      transcriptParsed: true, // the returned trace must include the late transcript
+      toolCalls: [{ name: 'search', arguments: { query: 'planning' } }],
+    });
+  });
+
+  it('discovers nested Claude local-agent session metadata', async () => { // session files two directory levels below dataDir must still be found
+    const root = await mkdtemp(join(tmpdir(), 'claude-nested-'));
+    const nested = join(root, 'workspace', 'project'); // metadata lives under root/workspace/project rather than directly under root
+    await mkdir(join(nested, 'local_nested'), { recursive: true });
+    await writeFile(
+      join(nested, 'local_nested.json'),
+      JSON.stringify({
+        sessionId: 'local_nested',
+        initialMessage: 'MCP_SERVER_TESTER_NESTED',
+        createdAt: new Date().toISOString(),
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(nested, 'local_nested', 'audit.jsonl'), [
+      { type: 'result', result: 'nested done' },
+    ]);
+
+    const matches = await findMatchingClaudeSessions({
+      dataDir: root, // search starts at the top-level directory
+      marker: 'MCP_SERVER_TESTER_NESTED',
+      snapshot: new Map(),
+      startedAtMs: Date.now() - 1000,
+    });
+
+    expect(matches).toHaveLength(1);
+    expect(matches[0]?.finalAnswer).toBe('nested done');
+  });
+
+  it('can match a single fresh Claude local-agent session without a prompt marker', async () => { // correlation says the marker was never sent, so matching cannot rely on it
+    const root = await mkdtemp(join(tmpdir(), 'claude-no-marker-'));
+    const sessionDir = join(root, 'local_no_marker');
+    await mkdir(sessionDir, { recursive: true });
+    await writeFile(
+      join(root, 'local_no_marker.json'),
+      JSON.stringify({
+        sessionId: 'local_no_marker',
+        initialMessage: 'plain prompt without marker', // the marker string appears nowhere in this session
+        createdAt: new Date().toISOString(),
+      }),
+      'utf-8'
+    );
+    await writeJsonl(join(sessionDir, 'audit.jsonl'), [
+      { type: 'result', result: 'plain prompt done' },
+    ]);
+
+    const matches = await findMatchingClaudeSessions({
+      dataDir: root,
+      marker: 'MCP_SERVER_TESTER_NOT_IN_PROMPT',
+      correlation: {
+        strategy: 'none',
+        marker: 'MCP_SERVER_TESTER_NOT_IN_PROMPT',
+        includedInPrompt: false, // makes findMatchingClaudeSessions parse the trace without a marker
+      },
+      snapshot: new Map(),
+      startedAtMs: Date.now() - 1000,
+    });
+
+    expect(matches).toHaveLength(1);
+    expect(matches[0]?.finalAnswer).toBe('plain prompt done');
+  });
+
+  it('extracts final answer from accessibility fallback text', () => { // newline-separated accessibility dump
+    expect(
+      extractAccessibilityResponse(
+        [
+          'You said: Please reply with exactly: external host integration acknowledged.',
+          '[eval-run-marker:MCP_SERVER_TESTER_TEST]',
+          'Claude responded: external host integration acknowledged.', // the text after this prefix is the expected answer
+          'Write a message...', // composer placeholder that must not leak into the answer
+        ].join('\n')
+      )
+    ).toBe('external host integration acknowledged.');
+  });
+
+  it('extracts final answer from comma-separated accessibility fallback text', () => { // same content as the newline case, flattened into one comma-separated line
+    expect(
+      extractAccessibilityResponse(
+        'You said: prompt [eval-run-marker:MCP_SERVER_TESTER_TEST], Claude responded: external host integration acknowledged., Write a message...'
+      )
+    ).toBe('external host integration acknowledged.'); // trailing ', Write a message...' is excluded from the answer
+  });
+
+  it('recognizes the regular Claude Chat surface from visible controls', () => { // all five chat signals are present, above the threshold of three
+    expect(
+      looksLikeClaudeChatSurface(
+        [
+          'New chat',
+          'Projects',
+          'Artifacts',
+          'Ask your org',
+          'Write a message...', // contains the 'Write a message' signal as a substring
+        ].join('\n')
+      )
+    ).toBe(true);
+  });
+
+  it('does not classify a local-agent surface from generic composer text alone', () => { // a single matching signal stays below the threshold of three
+    expect(
+      looksLikeClaudeChatSurface(
+        ['Claude Code', 'Session', 'Write a message...'].join('\n') // only 'Write a message' is a chat signal here
+      )
+    ).toBe(false);
+  });
+});
diff --git a/src/evals/externalHost/builtins/anthropicClaude.ts b/src/evals/externalHost/builtins/anthropicClaude.ts
new file mode 100644
index 0000000..2f852bc
--- /dev/null
+++ b/src/evals/externalHost/builtins/anthropicClaude.ts
@@ -0,0 +1,1389 @@
+import { randomUUID } from 'node:crypto';
+import { readdir, readFile, stat } from 'node:fs/promises';
+import { homedir } from 'node:os';
+import { basename, dirname, join } from 'node:path';
+import { Readable } from 'node:stream';
+import { parse as parseNdjson } from 'ndjson';
+import type { LLMToolCall } from '../../mcpHost/mcpHostTypes.js';
+import type {
+  ExternalHostConfig,
+  ExternalHostCapabilityContext,
+  ExternalHostCapabilityImplementation,
+  ExternalHostFailureKind,
+  ExternalHostMetadata,
+  ExternalHostRunResult,
+  HostArtifact,
+  HostCapability,
+  HostDriverId,
+  HostRunContext,
+} from '../types.js';
+import type { UsageMetrics } from '../../../types/index.js';
+import { driverToSlug, hostTypeFromDriver } from '../driverIdentity.js';
+import {
+  readMacosAccessibilityText,
+  readMacosFrontWindowContents,
+  runAppleScript,
+} from './macosDesktop.js';
+
+const DEFAULT_APP_NAME = 'Claude'; // application name used when no 'appName' option is configured
+const POLL_INTERVAL_MS = 750; // pause between polls in waitForClaudeTrace
+const TRACE_SETTLE_AFTER_COMPLETE_MS = 1_500; // how long isTraceReady keeps waiting for a transcript after the trace is first seen complete
+const CLAUDE_DESKTOP_MACOS_CAPABILITIES = [ // NOTE(review): not referenced in this part of the file; presumably consumed further down
+  'control',
+  'input',
+  'completion',
+  'trace',
+  'normalize',
+] as const; // `as const` keeps the entries as string-literal types
+
+export interface ClaudeSessionMetadata { // Session metadata record; every field is optional. Presumably parsed from the JSON file at SessionCandidate.metadataPath — confirm.
+  sessionId?: string;
+  cliSessionId?: string; // when set, isTraceReady expects a parsed transcript (or the settle delay) before returning
+  createdAt?: string | number; // string or number; converted via metadataTimestampMs for the recency check
+  lastActivityAt?: string | number;
+  cwd?: string;
+  model?: string;
+  title?: string;
+  initialMessage?: string;
+}
+
+export interface SessionCandidate { // One session entry as returned by listSessionCandidates
+  id: string; // reported in ambiguity and timeout error messages
+  metadataPath: string; // used as the key in ClaudeSessionSnapshot
+  sessionDir: string;
+  statMtimeMs: number; // compared against the snapshot's mtimeMs to detect changes
+  metadata: ClaudeSessionMetadata;
+}
+
+interface SnapshotEntry { // Per-session state recorded by snapshotClaudeSessions
+  mtimeMs: number; // the candidate's statMtimeMs at snapshot time
+}
+
+export type ClaudeSessionSnapshot = Map<string, SnapshotEntry>; // keyed by SessionCandidate.metadataPath
+
+export interface ClaudeTrace { // Parsed view of one Claude session, as produced by parseClaudeTrace
+  candidate: SessionCandidate; // the session this trace was parsed from
+  auditPath?: string;
+  transcriptPath?: string;
+  finalAnswer?: string; // undefined makes normalization report a parse_failure
+  toolCalls: LLMToolCall[];
+  usage?: UsageMetrics;
+  requestId?: string;
+  completedAt?: string;
+  llmDurationMs?: number;
+  terminalReason?: string; // appended to the generic error message when isError is set and no finalAnswer exists
+  isError?: boolean; // makes normalization return a host_run_failed result
+  isComplete: boolean; // must be true before waitForClaudeTrace returns the trace
+  auditParsed: boolean;
+  transcriptParsed: boolean; // lets isTraceReady skip the settle delay when a cliSessionId is present
+  usageAvailable: boolean;
+  costAvailable: boolean;
+  parseWarnings: string[]; // forwarded as `limitations` when building trace metadata
+  rawText: string;
+}
+
+interface ClaudeAuditEvent { // Loose shape of one JSONL event; all fields optional. Presumably used for both audit and transcript lines — confirm against the parser.
+  type?: string; // e.g. 'result' or 'assistant' in the test fixtures
+  result?: unknown;
+  is_error?: boolean;
+  duration_ms?: number;
+  duration_api_ms?: number;
+  total_cost_usd?: number;
+  requestId?: string; // both camelCase and snake_case spellings are declared
+  request_id?: string;
+  usage?: Record<string, unknown>;
+  message?: {
+    content?: Array<{ // content parts; fixtures use type 'text' (with text) and 'tool_use' (with id, name, input)
+      type?: string;
+      id?: string;
+      name?: string;
+      input?: Record<string, unknown>;
+      text?: string;
+    }>;
+  };
+  timestamp?: string;
+  terminal_reason?: string;
+}
+
+export const ANTHROPIC_CLAUDE_CAPABILITIES: ExternalHostCapabilityImplementation[] = // built-in capability implementations for the Claude desktop host, each identified by a 'builtin:anthropic.claude.*' id
+  [
+    {
+      id: 'builtin:anthropic.claude.coworkSurface',
+      capabilities: ['control'],
+      run: rejectClaudeChatSurfaceCapability, // fails the run when the regular Chat surface is detected or cannot be ruled out
+    },
+    {
+      id: 'builtin:anthropic.claude.activateCoworkSurface',
+      capabilities: ['control'],
+      run: activateCoworkSurfaceCapability, // sends Cmd+2 to switch surfaces
+    },
+    {
+      id: 'builtin:anthropic.claude.accessibilityTrace',
+      capabilities: ['completion', 'trace', 'normalize'],
+      run: captureClaudeChatAccessibilityResultCapability, // returns the result of waitForAccessibilityTrace directly
+    },
+    {
+      id: 'builtin:anthropic.claude.localAgentTrace',
+      capabilities: ['completion', 'trace'],
+      setup: snapshotClaudeSessionsCapability, // stores the session snapshot that `run` later requires
+      run: captureClaudeCoworkAgentTraceCapability, // stores the parsed trace in state.data.claudeTrace
+    },
+    {
+      id: 'builtin:anthropic.claude.localAgentNormalize',
+      capabilities: ['normalize'],
+      run: normalizeClaudeCoworkAgentTraceCapability, // converts state.data.claudeTrace into an ExternalHostRunResult
+    },
+  ];
+
+/**
+ * Deterministically switches the Claude desktop app to the Cowork surface via
+ * Cmd+2 (the app's built-in shortcut for the Cowork sidebar tab). Idempotent —
+ * sending Cmd+2 while already on Cowork is a no-op. Replaces the older
+ * rejectClaudeChatSurface capability for use cases that need automatic surface
+ * activation (e.g. CI runs).
+ */
+async function activateCoworkSurfaceCapability({
+  config,
+  run,
+  binding,
+  state,
+}: ExternalHostCapabilityContext): Promise<ExternalHostRunResult | void> { // resolves to nothing on success, or to a failure result when the script throws
+  const appName =
+    runStringOption(config, binding, 'appName') ?? DEFAULT_APP_NAME; // binding option first, then config option, then 'Claude'
+  const settleDelayMs = 700; // converted to seconds for the AppleScript `delay` after the keystroke
+  const script = `
+tell application ${JSON.stringify(appName)} to activate
+delay 0.4
+tell application "System Events"
+  tell process ${JSON.stringify(appName)}
+    set frontmost to true
+    keystroke "2" using command down
+  end tell
+end tell
+delay ${settleDelayMs / 1000}
+return "ok"
+`; // JSON.stringify wraps appName in double quotes and escapes embedded quotes before interpolation
+  try {
+    await runAppleScript(script, { timeoutMs: 8_000 }); // return value is ignored; only a throw matters
+  } catch (err) {
+    return failureResult({ // script failure is reported as a submission failure rather than rethrown
+      config,
+      context: run,
+      driver: state.driver,
+      displayName: state.displayName,
+      capabilitiesUsed: state.capabilitiesUsed,
+      failureKind: 'submission_failed',
+      error: `Failed to activate Cowork surface via Cmd+2: ${formatError(err)}`,
+      artifacts: [],
+      limitations: [
+        'Cowork surface activation depends on Cmd+2 being bound to the Cowork sidebar tab in the user-installed Claude app version.',
+      ],
+    });
+  }
+}
+
+async function rejectClaudeChatSurfaceCapability({ // Control step: fails the run when the regular Chat surface is detected or the surface cannot be verified.
+  config,
+  run,
+  binding,
+  state,
+}: ExternalHostCapabilityContext): Promise<ExternalHostRunResult | void> {
+  const appName =
+    runStringOption(config, binding, 'appName') ?? DEFAULT_APP_NAME;
+  const chatSurfaceReason = await detectClaudeChatSurface(appName); // undefined means no reason to reject; a string covers both a Chat match and a failed read
+  if (!chatSurfaceReason) {
+    return; // nothing to reject; resolves to undefined
+  }
+
+  return failureResult({
+    config,
+    context: run,
+    driver: state.driver,
+    displayName: state.displayName,
+    capabilitiesUsed: state.capabilitiesUsed,
+    failureKind: 'submission_failed',
+    error: `${state.displayName} surface is not active: ${chatSurfaceReason}`,
+    artifacts: [],
+    limitations: [
+      'Cowork is a distinct Claude Desktop surface; this driver will not submit Cowork evals through the regular Claude Chat composer.',
+      'Open or focus an active Cowork/local-agent session before running this driver, or add a deterministic Cowork launch step.',
+    ],
+  });
+}
+
+async function snapshotClaudeSessionsCapability({ // Setup step: records the data dir and a session snapshot in state.data for the later trace step.
+  config,
+  run,
+  binding,
+  state,
+}: ExternalHostCapabilityContext): Promise<ExternalHostRunResult | void> {
+  const dataDir = getClaudeDataDir(config, binding);
+  state.data.claudeDataDir = dataDir; // stored before snapshotting, so it is set even if the snapshot fails
+
+  try {
+    state.data.claudeSessionSnapshot = await snapshotClaudeSessions(dataDir);
+  } catch (err) {
+    return failureResult({ // snapshot errors become a parse_failure result instead of propagating
+      config,
+      context: run,
+      driver: state.driver,
+      displayName: state.displayName,
+      capabilitiesUsed: state.capabilitiesUsed,
+      failureKind: 'parse_failure',
+      error: `Failed to snapshot Claude session directory: ${formatError(err)}`,
+      artifacts: [],
+      limitations: [`Claude data directory: ${dataDir}`],
+    });
+  }
+}
+
+async function captureClaudeChatAccessibilityResultCapability({ // Returns the waitForAccessibilityTrace result, or a failure result if it throws.
+  config,
+  run,
+  binding,
+  state,
+}: ExternalHostCapabilityContext): Promise<ExternalHostRunResult | void> {
+  try {
+    return await waitForAccessibilityTrace({ // `return await` keeps a rejection inside this try/catch
+      config,
+      context: run,
+      driver: state.driver,
+      displayName: state.displayName,
+      capabilitiesUsed: state.capabilitiesUsed,
+      timeoutMs: run.timeoutMs,
+      appName: runStringOption(config, binding, 'appName'), // may be undefined; no default is applied here
+    });
+  } catch (err) {
+    const message = formatError(err);
+    return failureResult({
+      config,
+      context: run,
+      driver: state.driver,
+      displayName: state.displayName,
+      capabilitiesUsed: state.capabilitiesUsed,
+      failureKind: classifyTraceFailure(message), // failure kind is derived from the error message text
+      error: message,
+      artifacts: [],
+      limitations: [
+        'Claude Chat Desktop currently uses Accessibility as the fallback trace source; IndexedDB parsing has not been stabilized.',
+      ],
+    });
+  }
+}
+
+async function captureClaudeCoworkAgentTraceCapability({ // Trace step: waits for the matching session and stores its trace in state.data.claudeTrace.
+  config,
+  run,
+  binding,
+  state,
+}: ExternalHostCapabilityContext): Promise<ExternalHostRunResult | void> {
+  const dataDir =
+    typeof state.data.claudeDataDir === 'string'
+      ? state.data.claudeDataDir // reuse the directory stored by the setup step
+      : getClaudeDataDir(config, binding); // otherwise resolve it again from options
+  const snapshot = state.data.claudeSessionSnapshot as // unchecked cast: state.data values are not statically typed here
+    | ClaudeSessionSnapshot
+    | undefined;
+
+  if (!snapshot) {
+    return failureResult({ // without a snapshot the step fails instead of guessing
+      config,
+      context: run,
+      driver: state.driver,
+      displayName: state.displayName,
+      capabilitiesUsed: state.capabilitiesUsed,
+      failureKind: 'parse_failure',
+      error: 'Claude Cowork trace step requires a session snapshot.',
+      artifacts: [],
+      limitations: [`Claude data directory: ${dataDir}`],
+    });
+  }
+
+  try {
+    state.data.claudeTrace = await waitForClaudeTrace({ // on success nothing is returned; the trace is handed over via state
+      dataDir,
+      marker: run.marker,
+      correlation: run.correlation,
+      snapshot,
+      timeoutMs: run.timeoutMs,
+      startedAtMs: run.startedAtMs,
+    });
+  } catch (err) {
+    const message = formatError(err); // covers the ambiguous-session, timeout and no-match errors thrown by waitForClaudeTrace
+    return failureResult({
+      config,
+      context: run,
+      driver: state.driver,
+      displayName: state.displayName,
+      capabilitiesUsed: state.capabilitiesUsed,
+      failureKind: classifyTraceFailure(message),
+      error: message,
+      artifacts: [],
+      limitations: [`Claude data directory: ${dataDir}`],
+    });
+  }
+}
+
+async function normalizeClaudeCoworkAgentTraceCapability({ // Normalize step: converts state.data.claudeTrace into an ExternalHostRunResult; always returns a result.
+  config,
+  run,
+  state,
+}: ExternalHostCapabilityContext): Promise<ExternalHostRunResult> {
+  const trace = state.data.claudeTrace as ClaudeTrace | undefined; // unchecked cast of the value stored by the trace step
+  if (!trace) {
+    return failureResult({
+      config,
+      context: run,
+      driver: state.driver,
+      displayName: state.displayName,
+      capabilitiesUsed: state.capabilitiesUsed,
+      failureKind: 'parse_failure',
+      error: 'Claude Cowork trace normalization requires a parsed trace.',
+      artifacts: [],
+      limitations: [],
+    });
+  }
+
+  const artifacts = buildArtifacts(trace);
+  const metadata = buildClaudeTraceMetadata({ // built once and shared by all three outcomes below
+    config,
+    context: run,
+    driver: state.driver,
+    displayName: state.displayName,
+    capabilitiesUsed: state.capabilitiesUsed,
+    artifacts,
+    trace,
+    limitations: trace.parseWarnings, // parse warnings are passed on as limitations
+  });
+
+  if (trace.isError) { // the error flag is checked first, even when a final answer exists
+    return {
+      success: false,
+      toolCalls: trace.toolCalls,
+      error:
+        trace.finalAnswer ?? // the final answer text doubles as the error message when present
+        `Claude host run failed${trace.terminalReason ? `: ${trace.terminalReason}` : ''}`,
+      externalHost: {
+        ...metadata,
+        failureKind: 'host_run_failed',
+      },
+    };
+  }
+
+  if (trace.finalAnswer === undefined) { // strict undefined check: an empty-string answer continues to the success path
+    return {
+      success: false,
+      toolCalls: trace.toolCalls,
+      error: 'Claude trace completed but did not include a final answer.',
+      externalHost: {
+        ...metadata,
+        failureKind: 'parse_failure',
+      },
+    };
+  }
+
+  return {
+    success: true,
+    toolCalls: trace.toolCalls,
+    response: trace.finalAnswer,
+    conversationHistory: trace.finalAnswer // truthiness check: an empty-string answer yields undefined history
+      ? [{ role: 'assistant', content: trace.finalAnswer }]
+      : undefined,
+    usage: trace.usage,
+    llmDurationMs: trace.llmDurationMs,
+    externalHost: metadata,
+  };
+}
+
+function stringOption( // Returns options[key] when it is a string; otherwise undefined (also when options is undefined).
+  options: Record<string, unknown> | undefined,
+  key: string
+): string | undefined {
+  const value = options?.[key];
+  return typeof value === 'string' ? value : undefined; // non-string values are discarded, not coerced
+}
+
+function configStringOption( // Returns config.options[key] when it is a string; otherwise undefined.
+  config: ExternalHostConfig,
+  key: string
+): string | undefined {
+  const value = config.options?.[key]; // config.options may be absent
+  return typeof value === 'string' ? value : undefined;
+}
+
+function runStringOption( // Resolves a string option, preferring the binding's `with` value over the config-level value.
+  config: ExternalHostConfig,
+  binding: { with?: Record<string, unknown> },
+  key: string
+): string | undefined {
+  return stringOption(binding.with, key) ?? configStringOption(config, key); // `??` falls through only on null/undefined, so an empty string from the binding wins
+}
+
+export function getClaudeDataDir( // Resolves the session directory: the 'dataDir' option if configured, else a default under the user's home.
+  config: ExternalHostConfig,
+  binding?: { with?: Record<string, unknown> }
+): string {
+  const configuredDataDir = binding
+    ? runStringOption(config, binding, 'dataDir') // with a binding: binding value first, then config
+    : configStringOption(config, 'dataDir'); // without a binding: config only
+
+  return (
+    configuredDataDir ??
+    join( // default: <home>/Library/Application Support/Claude/local-agent-mode-sessions
+      homedir(),
+      'Library',
+      'Application Support',
+      'Claude',
+      'local-agent-mode-sessions'
+    )
+  );
+}
+
+export async function snapshotClaudeSessions( // Records the current statMtimeMs of every session candidate under dataDir.
+  dataDir: string
+): Promise<ClaudeSessionSnapshot> {
+  const snapshot = new Map<string, SnapshotEntry>();
+  const sessions = await listSessionCandidates(dataDir); // any error from listing propagates to the caller
+  for (const session of sessions) {
+    snapshot.set(session.metadataPath, { mtimeMs: session.statMtimeMs }); // keyed by metadata path
+  }
+  return snapshot;
+}
+
+export async function waitForClaudeTrace(options: { // Polls for exactly one matching session until its trace is ready; throws on ambiguity, timeout, or no match.
+  dataDir: string;
+  marker: string;
+  correlation: HostRunContext['correlation'];
+  snapshot: ClaudeSessionSnapshot;
+  timeoutMs: number;
+  startedAtMs: number;
+}): Promise<ClaudeTrace> {
+  const deadline = Date.now() + options.timeoutMs;
+  let lastPending: ClaudeTrace | undefined; // most recent single match that was not yet ready
+  let completeTraceFirstSeenAtMs: number | undefined; // set once, the first time a matched trace reports isComplete
+
+  while (Date.now() < deadline) { // deadline is checked only at the top of each iteration
+    const matches = await findMatchingClaudeSessions(options);
+
+    if (matches.length > 1) {
+      throw new Error( // more than one candidate: fail immediately rather than pick one
+        `Ambiguous Claude sessions for ${describeCorrelation(options)}: ${matches
+          .map((m) => m.candidate.id)
+          .join(', ')}`
+      );
+    }
+
+    if (matches.length === 1) {
+      const trace = matches[0]!; // non-null is safe: length was just checked
+      if (isTraceReady(trace, completeTraceFirstSeenAtMs)) {
+        return trace;
+      }
+      if (trace.isComplete && completeTraceFirstSeenAtMs === undefined) {
+        completeTraceFirstSeenAtMs = Date.now(); // starts the settle window that isTraceReady measures
+      }
+      lastPending = trace;
+    }
+
+    await delay(POLL_INTERVAL_MS);
+  }
+
+  if (lastPending) {
+    throw new Error( // a session matched but never became ready before the deadline
+      `Timed out waiting for Claude session ${lastPending.candidate.id} to complete`
+    );
+  }
+
+  throw new Error( // no poll ever produced a single match
+    `No matching Claude session found for ${describeCorrelation(options)}`
+  );
+}
+
+function isTraceReady( // Decides whether a matched trace can be returned to the caller now.
+  trace: ClaudeTrace,
+  completeTraceFirstSeenAtMs: number | undefined
+): boolean {
+  if (!trace.isComplete) {
+    return false; // never ready before the trace reports completion
+  }
+
+  if (!trace.candidate.metadata.cliSessionId || trace.transcriptParsed) {
+    return true; // no transcript is expected, or it has already been parsed
+  }
+
+  return ( // transcript expected but missing: ready only after the settle window has elapsed
+    completeTraceFirstSeenAtMs !== undefined &&
+    Date.now() - completeTraceFirstSeenAtMs >= TRACE_SETTLE_AFTER_COMPLETE_MS
+  );
+}
+
+export async function findMatchingClaudeSessions(options: { // Parses every new, updated or recent session under dataDir and returns those accepted by sessionMatchesCorrelation.
+  dataDir: string;
+  marker: string;
+  correlation?: HostRunContext['correlation'];
+  snapshot: ClaudeSessionSnapshot;
+  startedAtMs: number;
+}): Promise<ClaudeTrace[]> {
+  const sessions = await listSessionCandidates(options.dataDir);
+  const traces: ClaudeTrace[] = [];
+
+  for (const session of sessions) { // sessions are parsed one at a time, in listing order
+    const previous = options.snapshot.get(session.metadataPath);
+    const isNewOrUpdated =
+      previous === undefined || session.statMtimeMs > previous.mtimeMs; // absent from the snapshot, or modified since
+    const createdAtMs = metadataTimestampMs(session.metadata.createdAt);
+    const isRecent =
+      !Number.isNaN(createdAtMs) && createdAtMs >= options.startedAtMs - 5_000; // created no earlier than 5s before the run start; NaN is never recent
+
+    if (!isNewOrUpdated && !isRecent) {
+      continue; // unchanged and not recent: skip without parsing
+    }
+
+    const trace = await parseClaudeTrace(
+      session,
+      options.correlation?.includedInPrompt === false
+        ? undefined // marker was not sent in the prompt, so parse without it
+        : options.marker // also used when no correlation is supplied
+    );
+    if (
+      sessionMatchesCorrelation({
+        session,
+        trace,
+        marker: options.marker,
+        correlation: options.correlation,
+        isNewOrUpdated,
+        isRecent,
+      })
+    ) {
+      traces.push(trace);
+    }
+  }
+
+  return traces;
+}
+
+function describeCorrelation(options: { // Builds the phrase used in error messages to say how sessions were being correlated.
+  marker: string;
+  correlation?: HostRunContext['correlation'];
+}): string {
+  if (options.correlation?.includedInPrompt) {
+    return `marker ${options.marker}`; // marker-based correlation: name the marker
+  }
+  return `${options.correlation?.strategy ?? 'none'} correlation near the run start`; // strategy defaults to 'none' when missing
+}
+
+async function readAccessibilityFallback( // Builds a low-confidence success result from the app's accessibility text, or undefined if no usable answer is visible.
+  config: ExternalHostConfig,
+  context: HostRunContext,
+  driver: HostDriverId,
+  displayName: string,
+  capabilitiesUsed: readonly HostCapability[],
+  options: { appName?: string } = {}
+): Promise<ExternalHostRunResult | undefined> {
+  let visibleText: string;
+  try {
+    visibleText = await readMacosAccessibilityText(
+      options.appName ?? // explicit option, then config 'appName', then the default
+        configStringOption(config, 'appName') ??
+        DEFAULT_APP_NAME
+    );
+  } catch {
+    return undefined; // read errors are swallowed and reported as "nothing captured"
+  }
+
+  if (!visibleText.includes(context.marker)) {
+    return undefined; // visible text does not belong to this run
+  }
+
+  const response = extractAccessibilityResponse(visibleText);
+  if (!response) {
+    return undefined; // marker is visible but no response could be extracted
+  }
+
+  return {
+    success: true,
+    toolCalls: [], // tool calls are not available from this source
+    response,
+    conversationHistory: [{ role: 'assistant', content: response }],
+    externalHost: {
+      ...buildHostIdentityMetadata(config, driver, displayName),
+      hostVariant: config.variant,
+      capabilitiesUsed: [...capabilitiesUsed], // copied so the result does not alias the caller's array
+      traceSource: 'accessibility',
+      traceConfidence: 'low',
+      traceLimitations: [
+        'Claude did not produce a matching local-agent transcript; final answer was captured from the visible Accessibility tree.',
+        'Tool calls, token usage, cost, and hidden context are unavailable from this fallback source.',
+      ],
+      artifacts: [
+        {
+          kind: 'trace',
+          name: 'Claude visible accessibility text',
+          contentType: 'text/plain',
+          summary: visibleText.slice(0, 1000), // only the first 1000 characters are kept
+        },
+      ],
+      session: {
+        runMarker: context.marker,
+      },
+      correlation: context.correlation,
+      sources: {
+        finalAnswer: 'accessibility',
+        toolCalls: 'none',
+        usage: 'none',
+        cost: 'none',
+      },
+      evidence: { // per-field provenance: only the final answer has a source, at low confidence
+        finalAnswer: { source: 'accessibility', confidence: 'low' },
+        toolCalls: { source: 'none', confidence: 'unknown' },
+        usage: { source: 'none', confidence: 'unknown' },
+        cost: { source: 'none', confidence: 'unknown' },
+      },
+    },
+  };
+}
+
+/**
+ * Checks whether the app's front window currently looks like the regular
+ * Claude Chat surface.
+ *
+ * @returns a description when the chat surface is recognised, a "could not
+ * verify" message when the window contents cannot be read, and `undefined`
+ * when the contents were read but do not look like the chat surface.
+ */
+async function detectClaudeChatSurface(
+  appName: string
+): Promise<string | undefined> {
+  let windowContents: string;
+  try {
+    // `entire contents of front window` arrives as one batch transfer that can
+    // reach multiple MB on a fully-loaded Electron window (hence the large
+    // maxBuffer in runAppleScript). The recursive AppleScript alternative does
+    // one IPC round-trip per element and hits the per-script timeout on large
+    // trees.
+    windowContents = await readMacosFrontWindowContents(appName);
+  } catch (readFailure) {
+    return `could not verify active Claude surface via Accessibility: ${formatError(readFailure)}`;
+  }
+
+  return looksLikeClaudeChatSurface(windowContents)
+    ? 'visible controls match the regular Claude Chat surface'
+    : undefined;
+}
+
+/**
+ * Heuristic check for the regular Claude Chat surface: the text must contain
+ * at least three of the known chat UI labels (case-sensitive substring match).
+ */
+export function looksLikeClaudeChatSurface(visibleText: string): boolean {
+  const requiredMatches = 3;
+  let matched = 0;
+
+  for (const label of [
+    'New chat',
+    'Projects',
+    'Artifacts',
+    'Ask your org',
+    'Write a message',
+  ]) {
+    if (visibleText.includes(label)) {
+      matched += 1;
+    }
+  }
+
+  return matched >= requiredMatches;
+}
+
+/**
+ * Polls the Accessibility fallback until it produces a result or the timeout
+ * elapses, sleeping `POLL_INTERVAL_MS` between attempts.
+ *
+ * @throws Error when no visible response for the run marker appears before
+ * `timeoutMs` has passed.
+ */
+async function waitForAccessibilityTrace(options: {
+  config: ExternalHostConfig;
+  context: HostRunContext;
+  driver: HostDriverId;
+  displayName: string;
+  capabilitiesUsed: readonly HostCapability[];
+  timeoutMs: number;
+  appName?: string;
+}): Promise<ExternalHostRunResult> {
+  const giveUpAt = Date.now() + options.timeoutMs;
+
+  for (let now = Date.now(); now < giveUpAt; now = Date.now()) {
+    const captured = await readAccessibilityFallback(
+      options.config,
+      options.context,
+      options.driver,
+      options.displayName,
+      options.capabilitiesUsed,
+      { appName: options.appName }
+    );
+    if (captured) {
+      return captured;
+    }
+    await delay(POLL_INTERVAL_MS);
+  }
+
+  throw new Error(
+    `Timed out waiting for Claude Chat Desktop visible response for marker ${options.context.marker}`
+  );
+}
+
+/**
+ * Builds a `ClaudeTrace` for one session candidate from its on-disk logs.
+ *
+ * Reads `audit.jsonl` from the session directory and, when the metadata has a
+ * `cliSessionId`, looks for `<cliSessionId>.jsonl` somewhere under that
+ * directory. Read and parse problems are collected in `parseWarnings` instead
+ * of being thrown.
+ *
+ * @param candidate Session whose logs should be parsed.
+ * @param marker Optional run marker; when given, events are narrowed with
+ * `selectEventsForMarker` before anything is extracted.
+ * @returns The parsed trace. `isComplete` is true only when a result event was
+ * found; `rawText` is the raw audit and transcript text joined by a newline.
+ */
+export async function parseClaudeTrace(
+  candidate: SessionCandidate,
+  marker?: string
+): Promise<ClaudeTrace> {
+  const parseWarnings: string[] = [];
+  const auditPath = join(candidate.sessionDir, 'audit.jsonl');
+  // The transcript is only searched for when the metadata names a CLI session.
+  const transcriptPath = candidate.metadata.cliSessionId
+    ? await findFile(
+        candidate.sessionDir,
+        `${candidate.metadata.cliSessionId}.jsonl`
+      )
+    : undefined;
+
+  let auditEvents: ClaudeAuditEvent[] = [];
+  let transcriptEvents: ClaudeAuditEvent[] = [];
+  let rawAudit = '';
+  let rawTranscript = '';
+  let auditParsed = false;
+  let transcriptParsed = false;
+
+  try {
+    rawAudit = await readFile(auditPath, 'utf-8');
+    const parsed = await parseNdjsonContent<ClaudeAuditEvent>(
+      rawAudit,
+      'Claude audit log'
+    );
+    auditEvents = parsed.events;
+    // The audit log counts as parsed as soon as it produced at least one event,
+    // even if some lines were discarded (those surface as warnings).
+    auditParsed = parsed.events.length > 0;
+    parseWarnings.push(...parsed.warnings);
+  } catch (err) {
+    parseWarnings.push(`Could not read Claude audit log: ${formatError(err)}`);
+  }
+
+  if (transcriptPath) {
+    try {
+      rawTranscript = await readFile(transcriptPath, 'utf-8');
+      const parsed = await parseNdjsonContent<ClaudeAuditEvent>(
+        rawTranscript,
+        'Claude transcript'
+      );
+      transcriptEvents = parsed.events;
+      // Unlike the audit log, the transcript only counts as parsed when the
+      // parser reported `ok`.
+      transcriptParsed = parsed.ok;
+      parseWarnings.push(...parsed.warnings);
+    } catch (err) {
+      parseWarnings.push(
+        `Could not read Claude transcript: ${formatError(err)}`
+      );
+    }
+  } else if (candidate.metadata.cliSessionId) {
+    // A CLI session id exists but no matching transcript file was found.
+    parseWarnings.push(
+      `Could not locate transcript for cliSessionId ${candidate.metadata.cliSessionId}.`
+    );
+  }
+
+  const auditEventsForRun = selectEventsForMarker(
+    candidate.metadata,
+    auditEvents,
+    marker
+  );
+  const transcriptEventsForRun = selectEventsForMarker(
+    candidate.metadata,
+    transcriptEvents,
+    marker
+  );
+  const combinedEventsForRun = [
+    ...auditEventsForRun,
+    ...transcriptEventsForRun,
+  ];
+  // Prefer the audit log's last result event; fall back to the transcript's.
+  const resultEvent =
+    findLastResultEvent(auditEventsForRun) ??
+    findLastResultEvent(transcriptEventsForRun);
+  // Without a string result, fall back to the text blocks of all selected events.
+  const finalAnswer =
+    typeof resultEvent?.result === 'string'
+      ? resultEvent.result
+      : extractAssistantText(combinedEventsForRun);
+  const usage = resultEvent ? extractUsage(resultEvent) : undefined;
+  // Tool calls come from the transcript alone whenever it has events for this
+  // run; otherwise from the combined audit + transcript events.
+  const toolCalls = extractToolCalls(
+    transcriptEventsForRun.length > 0
+      ? transcriptEventsForRun
+      : combinedEventsForRun
+  );
+
+  return {
+    candidate,
+    auditPath,
+    transcriptPath,
+    finalAnswer,
+    toolCalls,
+    usage,
+    requestId: resultEvent?.requestId ?? resultEvent?.request_id,
+    completedAt: resultEvent?.timestamp,
+    llmDurationMs: resultEvent?.duration_api_ms ?? resultEvent?.duration_ms,
+    terminalReason: resultEvent?.terminal_reason,
+    isError: resultEvent?.is_error === true,
+    isComplete: resultEvent !== undefined,
+    auditParsed,
+    transcriptParsed,
+    usageAvailable: usage !== undefined,
+    costAvailable: typeof resultEvent?.total_cost_usd === 'number',
+    parseWarnings,
+    rawText: `${rawAudit}\n${rawTranscript}`,
+  };
+}
+
+/**
+ * Narrows a session's events to those belonging to the run with `marker`.
+ *
+ * - No marker: every event is returned.
+ * - Marker found in an event's JSON serialization: that event and everything
+ *   after it.
+ * - Marker not found in any event: all events if the session's initial
+ *   message contains the marker, otherwise none.
+ */
+function selectEventsForMarker(
+  metadata: ClaudeSessionMetadata,
+  events: ClaudeAuditEvent[],
+  marker?: string
+): ClaudeAuditEvent[] {
+  if (!marker) {
+    return events;
+  }
+
+  const firstHit = events.findIndex((candidate) =>
+    JSON.stringify(candidate).includes(marker)
+  );
+  if (firstHit >= 0) {
+    return events.slice(firstHit);
+  }
+
+  const markerInInitialMessage = metadata.initialMessage?.includes(marker);
+  return markerInInitialMessage ? events : [];
+}
+
+/**
+ * Assembles the `ExternalHostMetadata` for a run whose trace came from the
+ * host's local transcript files.
+ *
+ * Each evidence category (final answer, tool calls, usage, cost) is reported
+ * with the overall trace confidence when the trace provides it, and as
+ * `none` / `unknown` otherwise. When the run marker was not included in the
+ * prompt, a limitation explaining the weaker matching is added.
+ */
+export function buildClaudeTraceMetadata(options: {
+  config: ExternalHostConfig;
+  context: HostRunContext;
+  driver: HostDriverId;
+  displayName: string;
+  capabilitiesUsed?: readonly HostCapability[];
+  artifacts: HostArtifact[];
+  trace: ClaudeTrace;
+  limitations: string[];
+}): ExternalHostMetadata {
+  const { config, context, driver, displayName, trace } = options;
+  const { correlation } = context;
+  const sessionMetadata = trace.candidate.metadata;
+
+  const callerLimitations = [...options.limitations];
+  if (!correlation.includedInPrompt) {
+    callerLimitations.push(
+      'Trace was matched by recently updated host artifacts because no prompt marker was included.'
+    );
+  }
+  const limitations = buildTraceLimitations(trace, callerLimitations);
+  const traceConfidence = getTraceConfidence(trace, correlation);
+
+  const evidence = {
+    finalAnswer: buildEvidence(
+      trace.isComplete && trace.finalAnswer !== undefined,
+      traceConfidence
+    ),
+    toolCalls: buildEvidence(trace.transcriptParsed, traceConfidence),
+    usage: buildEvidence(trace.usageAvailable, traceConfidence),
+    cost: buildEvidence(trace.costAvailable, traceConfidence),
+  };
+
+  return {
+    ...buildHostIdentityMetadata(config, driver, displayName),
+    hostVariant: config.variant,
+    capabilitiesUsed: [
+      ...(options.capabilitiesUsed ?? CLAUDE_DESKTOP_MACOS_CAPABILITIES),
+    ],
+    traceSource: 'host-local-transcript',
+    traceConfidence,
+    // An empty limitation list is reported as "no limitations" (undefined).
+    traceLimitations: limitations.length > 0 ? limitations : undefined,
+    artifacts: options.artifacts,
+    session: {
+      id: sessionMetadata.sessionId ?? trace.candidate.id,
+      runMarker: context.marker,
+      requestId: trace.requestId,
+      cliSessionId: sessionMetadata.cliSessionId,
+      cwd: sessionMetadata.cwd,
+      startedAt: metadataTimestampString(sessionMetadata.createdAt),
+      completedAt: trace.completedAt,
+    },
+    correlation,
+    sources: {
+      finalAnswer: evidence.finalAnswer.source,
+      toolCalls: evidence.toolCalls.source,
+      usage: evidence.usage.source,
+      cost: evidence.cost.source,
+    },
+    evidence,
+  };
+}
+
+/**
+ * Builds one evidence entry: sourced from the host-local transcript at the
+ * given confidence when `available`, otherwise `none` with `unknown`
+ * confidence.
+ */
+function buildEvidence(
+  available: boolean,
+  confidence: ExternalHostMetadata['traceConfidence']
+) {
+  if (!available) {
+    return { source: 'none', confidence: 'unknown' } as const;
+  }
+  return { source: 'host-local-transcript', confidence } as const;
+}
+
+/**
+ * Rates how much a parsed trace can be trusted.
+ *
+ * - `unknown`: the trace has no result event or the audit log did not parse.
+ * - `medium`: audit lines were discarded, or the run marker was not part of
+ *   the prompt.
+ * - `high`: everything else.
+ */
+function getTraceConfidence(
+  trace: ClaudeTrace,
+  correlation: HostRunContext['correlation']
+): ExternalHostMetadata['traceConfidence'] {
+  const usable = trace.isComplete && trace.auditParsed;
+  if (!usable) {
+    return 'unknown';
+  }
+
+  const auditLinesDiscarded = trace.parseWarnings.some((text) =>
+    text.startsWith('Claude audit log discarded')
+  );
+  if (auditLinesDiscarded || !correlation.includedInPrompt) {
+    return 'medium';
+  }
+  return 'high';
+}
+
+/**
+ * Returns the caller's limitations plus one entry per evidence category the
+ * trace lacks (tool calls, usage, cost), de-duplicated in first-seen order.
+ * The input array is not modified.
+ */
+function buildTraceLimitations(
+  trace: ClaudeTrace,
+  limitations: string[]
+): string[] {
+  const gaps: ReadonlyArray<readonly [boolean, string]> = [
+    [
+      !trace.transcriptParsed,
+      'Tool-call evidence is unavailable because a complete structured Claude transcript was not found or could not be parsed.',
+    ],
+    [
+      !trace.usageAvailable,
+      'Usage evidence is unavailable from the parsed Claude trace.',
+    ],
+    [
+      !trace.costAvailable,
+      'Cost evidence is unavailable from the parsed Claude trace.',
+    ],
+  ];
+
+  // A Set keeps insertion order, so caller-supplied entries stay first.
+  const unique = new Set(limitations);
+  for (const [missing, text] of gaps) {
+    if (missing) {
+      unique.add(text);
+    }
+  }
+  return Array.from(unique);
+}
+
+/**
+ * Builds an unsuccessful `ExternalHostRunResult` carrying the error text,
+ * failure kind, limitations, and artifacts, with no tool calls and no trace
+ * source. A missing `capabilitiesUsed` is reported as an empty list.
+ */
+function failureResult(options: {
+  config: ExternalHostConfig;
+  context: HostRunContext;
+  driver: HostDriverId;
+  displayName: string;
+  capabilitiesUsed?: readonly HostCapability[];
+  failureKind: ExternalHostFailureKind;
+  error: string;
+  artifacts: HostArtifact[];
+  limitations: string[];
+}): ExternalHostRunResult {
+  const { config, context, driver, displayName } = options;
+  const identity = buildHostIdentityMetadata(config, driver, displayName);
+
+  return {
+    success: false,
+    toolCalls: [],
+    error: options.error,
+    externalHost: {
+      ...identity,
+      hostVariant: config.variant,
+      capabilitiesUsed: [...(options.capabilitiesUsed ?? [])],
+      traceSource: 'none',
+      traceConfidence: 'unknown',
+      traceLimitations: options.limitations,
+      artifacts: options.artifacts,
+      session: { runMarker: context.marker },
+      correlation: context.correlation,
+      failureKind: options.failureKind,
+    },
+  };
+}
+
+/**
+ * Produces the identity fields shared by every host metadata object.
+ *
+ * `displayName` is used for both `displayName` and `hostName`; the host type
+ * comes from the config when set and is otherwise derived from the driver.
+ */
+function buildHostIdentityMetadata(
+  config: ExternalHostConfig,
+  driver: HostDriverId,
+  displayName: string
+): Pick<
+  ExternalHostMetadata,
+  'driver' | 'driverSlug' | 'displayName' | 'hostName' | 'hostType'
+> {
+  const driverSlug = driverToSlug(driver);
+  const hostType = config.hostType ?? hostTypeFromDriver(driver);
+  return { driver, driverSlug, displayName, hostName: displayName, hostType };
+}
+
+/**
+ * Lists the on-disk files behind a trace as artifacts: always the session
+ * metadata file, then the audit log and the transcript when the trace has a
+ * path for them.
+ */
+function buildArtifacts(trace: ClaudeTrace): HostArtifact[] {
+  const metadataArtifact: HostArtifact = {
+    kind: 'metadata',
+    name: 'Claude session metadata',
+    path: trace.candidate.metadataPath,
+    contentType: 'application/json',
+  };
+  const auditArtifacts: HostArtifact[] = trace.auditPath
+    ? [
+        {
+          kind: 'audit',
+          name: 'Claude audit log',
+          path: trace.auditPath,
+          contentType: 'application/x-ndjson',
+        },
+      ]
+    : [];
+  const transcriptArtifacts: HostArtifact[] = trace.transcriptPath
+    ? [
+        {
+          kind: 'transcript',
+          name: 'Claude transcript',
+          path: trace.transcriptPath,
+          contentType: 'application/x-ndjson',
+        },
+      ]
+    : [];
+
+  return [metadataArtifact, ...auditArtifacts, ...transcriptArtifacts];
+}
+
+/**
+ * Pulls the assistant's reply out of text read from the Accessibility tree.
+ *
+ * Tried in order:
+ * 1. the last trimmed, non-empty line that starts with `Claude responded: `;
+ * 2. the first inline `Claude responded:` occurrence, up to the next comma or
+ *    newline;
+ * 3. the first line after the line containing `[eval-run-marker:` that is
+ *    neither the composer placeholder nor the AI disclaimer.
+ *
+ * @returns the extracted reply, or `undefined` when nothing matches.
+ */
+export function extractAccessibilityResponse(
+  visibleText: string
+): string | undefined {
+  const responsePrefix = 'Claude responded: ';
+
+  const lines: string[] = [];
+  for (const rawLine of visibleText.split('\n')) {
+    const trimmed = rawLine.trim();
+    if (trimmed) {
+      lines.push(trimmed);
+    }
+  }
+
+  const lastPrefixedLine = lines
+    .filter((line) => line.startsWith(responsePrefix))
+    .pop();
+  if (lastPrefixedLine) {
+    return lastPrefixedLine.slice(responsePrefix.length).trim();
+  }
+
+  const inlineMatch = /Claude responded:\s*([^,\n]+)/.exec(visibleText);
+  if (inlineMatch?.[1]) {
+    return inlineMatch[1].trim();
+  }
+
+  const markerLine = lines.findIndex((line) =>
+    line.includes('[eval-run-marker:')
+  );
+  if (markerLine < 0) {
+    return undefined;
+  }
+
+  return lines
+    .slice(markerLine + 1)
+    .find(
+      (line) =>
+        !(
+          line.startsWith('Write a message') ||
+          line.includes('Claude is AI and can make mistakes')
+        )
+    );
+}
+
+/**
+ * Loads every Claude session found under `dataDir`.
+ *
+ * Each metadata file becomes one candidate: its id is the file name without
+ * `.json`, and its session directory is the sibling directory of the same
+ * name. Metadata files that cannot be read, parsed, or stat'ed are skipped.
+ */
+async function listSessionCandidates(
+  dataDir: string
+): Promise<SessionCandidate[]> {
+  const loadCandidate = async (
+    metadataPath: string
+  ): Promise<SessionCandidate | undefined> => {
+    try {
+      const metadata = JSON.parse(
+        await readFile(metadataPath, 'utf-8')
+      ) as ClaudeSessionMetadata;
+      const { mtimeMs: metadataMtimeMs } = await stat(metadataPath);
+      const id = basename(metadataPath, '.json');
+      const sessionDir = join(dirname(metadataPath), id);
+      const statMtimeMs = await getSessionObservedMtime({
+        sessionDir,
+        cliSessionId: metadata.cliSessionId,
+        metadataMtimeMs,
+      });
+      return { id, metadataPath, sessionDir, statMtimeMs, metadata };
+    } catch {
+      // Unreadable or malformed sessions are ignored rather than fatal.
+      return undefined;
+    }
+  };
+
+  const found: SessionCandidate[] = [];
+  for (const metadataPath of await findClaudeMetadataFiles(dataDir)) {
+    const candidate = await loadCandidate(metadataPath);
+    if (candidate) {
+      found.push(candidate);
+    }
+  }
+  return found;
+}
+
+/**
+ * Returns the most recent modification time (ms) seen across a session's
+ * metadata file, `audit.jsonl`, the session directory itself, and, when a
+ * `cliSessionId` is given and the file exists, the transcript file.
+ * Paths that cannot be stat'ed are left out of the comparison.
+ */
+async function getSessionObservedMtime(options: {
+  sessionDir: string;
+  cliSessionId?: string;
+  metadataMtimeMs: number;
+}): Promise<number> {
+  const { sessionDir, cliSessionId, metadataMtimeMs } = options;
+
+  const auditMtime = await getFileMtime(join(sessionDir, 'audit.jsonl'));
+  const directoryMtime = await getFileMtime(sessionDir);
+  const transcriptPath = cliSessionId
+    ? await findFile(sessionDir, `${cliSessionId}.jsonl`)
+    : undefined;
+  const transcriptMtime = transcriptPath
+    ? await getFileMtime(transcriptPath)
+    : undefined;
+
+  const known = [
+    metadataMtimeMs,
+    auditMtime,
+    directoryMtime,
+    transcriptMtime,
+  ].filter((mtime): mtime is number => mtime !== undefined);
+  return Math.max(...known);
+}
+
+/**
+ * Returns the modification time of `path` in milliseconds, or `undefined`
+ * when the path cannot be stat'ed.
+ */
+async function getFileMtime(path: string): Promise<number | undefined> {
+  try {
+    const { mtimeMs } = await stat(path);
+    return mtimeMs;
+  } catch {
+    return undefined;
+  }
+}
+
+/**
+ * Walks the directory tree under `root` and collects the paths of all files
+ * named `local_<something>.json`. Directories that cannot be listed are
+ * skipped silently.
+ */
+async function findClaudeMetadataFiles(root: string): Promise<string[]> {
+  const metadataNamePattern = /^local_.+\.json$/;
+  const pendingDirs = [root];
+  const found: string[] = [];
+
+  for (
+    let dir = pendingDirs.pop();
+    dir !== undefined;
+    dir = pendingDirs.pop()
+  ) {
+    let entries;
+    try {
+      entries = await readdir(dir, { withFileTypes: true });
+    } catch {
+      continue;
+    }
+
+    for (const entry of entries) {
+      const entryPath = join(dir, entry.name);
+      if (entry.isFile() && metadataNamePattern.test(entry.name)) {
+        found.push(entryPath);
+      } else if (entry.isDirectory()) {
+        pendingDirs.push(entryPath);
+      }
+    }
+  }
+
+  return found;
+}
+
+/**
+ * Reports whether the run marker appears in the session's initial message,
+ * the trace's final answer, or the trace's raw log text.
+ */
+function sessionMatchesMarker(
+  session: SessionCandidate,
+  trace: ClaudeTrace,
+  marker: string
+): boolean {
+  const inInitialMessage = Boolean(
+    session.metadata.initialMessage?.includes(marker)
+  );
+  if (inInitialMessage) {
+    return true;
+  }
+  const inFinalAnswer = Boolean(trace.finalAnswer?.includes(marker));
+  return inFinalAnswer || trace.rawText.includes(marker);
+}
+
+/**
+ * Decides whether a session belongs to the current run.
+ *
+ * When the correlation explicitly says the marker was not included in the
+ * prompt, recency decides: the session must be new/updated or recent.
+ * In every other case (including a missing correlation) the marker must be
+ * found in the session or its trace.
+ */
+function sessionMatchesCorrelation(options: {
+  session: SessionCandidate;
+  trace: ClaudeTrace;
+  marker: string;
+  correlation?: HostRunContext['correlation'];
+  isNewOrUpdated: boolean;
+  isRecent: boolean;
+}): boolean {
+  const { session, trace, marker, correlation } = options;
+  const markerOmittedFromPrompt = correlation?.includedInPrompt === false;
+
+  return markerOmittedFromPrompt
+    ? options.isNewOrUpdated || options.isRecent
+    : sessionMatchesMarker(session, trace, marker);
+}
+
+/**
+ * Parses newline-delimited JSON text into events.
+ *
+ * The content is piped through `parseNdjson` with `strict: false`. The number
+ * of non-empty lines is then compared with the number of events produced;
+ * any shortfall is reported as a single "discarded … malformed JSONL line(s)"
+ * warning prefixed with `sourceName`.
+ *
+ * @param content Raw NDJSON text.
+ * @param sourceName Label used at the start of the warning message.
+ * @returns The parsed events, the warnings, and `ok`, which is true only when
+ * there are no warnings. The promise rejects if the parser emits `error`.
+ */
+async function parseNdjsonContent<T>(
+  content: string,
+  sourceName: string
+): Promise<{ events: T[]; ok: boolean; warnings: string[] }> {
+  const events: T[] = [];
+  // NOTE(review): strict=false presumably makes the parser skip malformed
+  // lines instead of erroring — confirm against the ndjson package docs.
+  const parser = parseNdjson({ strict: false });
+
+  // Listeners are attached before piping so no event is missed.
+  await new Promise<void>((resolve, reject) => {
+    parser.on('data', (event: T) => events.push(event));
+    parser.on('error', reject);
+    parser.on('end', resolve);
+    Readable.from([content]).pipe(parser);
+  });
+
+  // Every non-empty line that did not become an event counts as discarded.
+  const nonEmptyLineCount = content
+    .split('\n')
+    .filter((line) => line.trim().length > 0).length;
+  const discardedLineCount = nonEmptyLineCount - events.length;
+  const warnings =
+    discardedLineCount > 0
+      ? [
+          `${sourceName} discarded ${discardedLineCount} malformed JSONL line${
+            discardedLineCount === 1 ? '' : 's'
+          } using ndjson strict=false parsing.`,
+        ]
+      : [];
+
+  return { events, ok: warnings.length === 0, warnings };
+}
+
+/**
+ * Returns the last event that is a result event: one whose `type` is
+ * `'result'` or that has a `result` field other than `undefined`.
+ * Returns `undefined` when no event qualifies.
+ */
+function findLastResultEvent(
+  events: ClaudeAuditEvent[]
+): ClaudeAuditEvent | undefined {
+  const resultEvents = events.filter(
+    (candidate) =>
+      candidate.type === 'result' || candidate.result !== undefined
+  );
+  return resultEvents.pop();
+}
+
+/**
+ * Searches the directory tree under `root` for a file named exactly
+ * `filename` and returns the first path found, or `undefined` when there is
+ * none. Directories that cannot be listed are skipped.
+ */
+async function findFile(
+  root: string,
+  filename: string
+): Promise<string | undefined> {
+  const unexplored = [root];
+
+  while (true) {
+    const directory = unexplored.pop();
+    if (directory === undefined) {
+      return undefined;
+    }
+
+    let entries;
+    try {
+      entries = await readdir(directory, { withFileTypes: true });
+    } catch {
+      continue;
+    }
+
+    for (const entry of entries) {
+      const entryPath = join(directory, entry.name);
+      if (entry.isFile() && entry.name === filename) {
+        return entryPath;
+      }
+      if (entry.isDirectory()) {
+        unexplored.push(entryPath);
+      }
+    }
+  }
+}
+
+/**
+ * Concatenates, in order and without a separator, the text of every
+ * non-empty `text` content block found in the events' messages.
+ *
+ * @returns the joined text, or `undefined` when no text block was found.
+ */
+function extractAssistantText(events: ClaudeAuditEvent[]): string | undefined {
+  const collected: string[] = [];
+
+  for (const { message } of events) {
+    for (const block of message?.content ?? []) {
+      if (block.type !== 'text' || !block.text) {
+        continue;
+      }
+      collected.push(block.text);
+    }
+  }
+
+  if (collected.length === 0) {
+    return undefined;
+  }
+  return collected.join('');
+}
+
+/**
+ * Collects every named `tool_use` content block from the events as a tool
+ * call, in order of appearance.
+ *
+ * Names of the form `mcp__<prefix>__<tool>` are reduced to the part after the
+ * last `__` separator the pattern can match; other names are kept as-is.
+ * A block without `input` gets an empty arguments object.
+ */
+function extractToolCalls(events: ClaudeAuditEvent[]): LLMToolCall[] {
+  const mcpToolName = /^mcp__(.+)__(.+)$/;
+  const calls: LLMToolCall[] = [];
+
+  for (const { message } of events) {
+    for (const block of message?.content ?? []) {
+      if (block.type === 'tool_use' && block.name) {
+        const bareName = mcpToolName.exec(block.name)?.[2] ?? block.name;
+        calls.push({
+          name: bareName,
+          arguments: block.input ?? {},
+          id: block.id,
+        });
+      }
+    }
+  }
+
+  return calls;
+}
+
+/**
+ * Converts a result event's usage data into `UsageMetrics`.
+ *
+ * Token counts are read from the event's `usage` object under either the
+ * snake_case or the camelCase key. Returns `undefined` when the event has no
+ * input tokens, no output tokens, no total cost, and no duration; otherwise
+ * missing token counts, cost, and duration are reported as 0.
+ */
+function extractUsage(event: ClaudeAuditEvent): UsageMetrics | undefined {
+  const { usage } = event;
+  const readCount = (snakeKey: string, camelKey: string): number | undefined =>
+    getNumber(usage, snakeKey) ?? getNumber(usage, camelKey);
+
+  const inputTokens = readCount('input_tokens', 'inputTokens');
+  const outputTokens = readCount('output_tokens', 'outputTokens');
+
+  const hasAnySignal = [
+    inputTokens,
+    outputTokens,
+    event.total_cost_usd,
+    event.duration_ms,
+  ].some((value) => value !== undefined);
+  if (!hasAnySignal) {
+    return undefined;
+  }
+
+  return {
+    inputTokens: inputTokens ?? 0,
+    outputTokens: outputTokens ?? 0,
+    totalCostUsd: event.total_cost_usd ?? 0,
+    durationMs: event.duration_ms ?? 0,
+    durationApiMs: event.duration_api_ms,
+    cacheReadInputTokens: readCount(
+      'cache_read_input_tokens',
+      'cacheReadInputTokens'
+    ),
+    cacheCreationInputTokens: readCount(
+      'cache_creation_input_tokens',
+      'cacheCreationInputTokens'
+    ),
+  };
+}
+
+/**
+ * Reads `key` from an optional record and returns it only when the stored
+ * value is of type `number`; anything else yields `undefined`.
+ */
+function getNumber(
+  object: Record<string, unknown> | undefined,
+  key: string
+): number | undefined {
+  if (object == null) {
+    return undefined;
+  }
+  const candidate = object[key];
+  if (typeof candidate !== 'number') {
+    return undefined;
+  }
+  return candidate;
+}
+
+/**
+ * Normalizes a metadata timestamp to epoch milliseconds.
+ *
+ * Numbers are returned unchanged, strings are parsed with `Date.parse`, and
+ * both unparseable strings and `undefined` produce `NaN`.
+ */
+function metadataTimestampMs(value: string | number | undefined): number {
+  switch (typeof value) {
+    case 'number':
+      return value;
+    case 'string':
+      // Date.parse already yields NaN for strings it cannot interpret.
+      return Date.parse(value);
+    default:
+      return Number.NaN;
+  }
+}
+
+/**
+ * Normalizes a metadata timestamp to a string.
+ *
+ * Strings are returned unchanged, numbers are treated as epoch milliseconds
+ * and rendered as ISO 8601, and `undefined` stays `undefined`.
+ */
+function metadataTimestampString(
+  value: string | number | undefined
+): string | undefined {
+  switch (typeof value) {
+    case 'string':
+      return value;
+    case 'number':
+      return new Date(value).toISOString();
+    default:
+      return undefined;
+  }
+}
+
+/**
+ * Maps an error message to a failure kind by case-insensitive keyword
+ * matching. Keywords are checked in a fixed order and the first hit wins;
+ * messages without a known keyword are classified as `unknown`.
+ */
+function classifyTraceFailure(message: string): ExternalHostFailureKind {
+  const keywordToKind: ReadonlyArray<
+    readonly [string, ExternalHostFailureKind]
+  > = [
+    ['ambiguous', 'ambiguous_matching_sessions'],
+    ['timed out', 'timeout'],
+    ['no matching', 'no_matching_session'],
+    ['parse', 'parse_failure'],
+  ];
+
+  const normalized = message.toLowerCase();
+  for (const [keyword, kind] of keywordToKind) {
+    if (normalized.includes(keyword)) {
+      return kind;
+    }
+  }
+  return 'unknown';
+}
+
+/**
+ * Turns a caught value into display text: the `message` of an `Error`, or
+ * the string conversion of anything else.
+ */
+function formatError(err: unknown): string {
+  if (err instanceof Error) {
+    return err.message;
+  }
+  return String(err);
+}
+
+/** Resolves after roughly `ms` milliseconds, using `setTimeout`. */
+function delay(ms: number): Promise<void> {
+  return new Promise<void>((done) => {
+    setTimeout(done, ms);
+  });
+}
+
+/**
+ * Creates a run id of the form `<caseId>-<uuid>`, with a freshly generated
+ * random UUID on every call.
+ */
+export function createExternalHostRunId(caseId: string): string {
+  const uniqueSuffix = randomUUID();
+  return `${caseId}-${uniqueSuffix}`;
+}
diff --git a/src/evals/externalHost/builtins/macosDesktop.test.ts b/src/evals/externalHost/builtins/macosDesktop.test.ts
new file mode 100644
index 0000000..4898742
--- /dev/null
+++ b/src/evals/externalHost/builtins/macosDesktop.test.ts
@@ -0,0 +1,74 @@
+import { describe, expect, it } from 'vitest';
+import {
+  buildMacosDesktopSubmitScript,
+  MACOS_DESKTOP_CAPABILITIES,
+} from './macosDesktop.js';
+
+describe('macOS desktop built-in capabilities', () => {
+  // Shared fixture: every script test uses the same prompt, app name, and
+  // settle delay; only `createNewConversation` varies.
+  const buildScript = (createNewConversation: boolean): string =>
+    buildMacosDesktopSubmitScript('hello marker', {
+      appName: 'Example',
+      createNewConversation,
+      settleDelayMs: 500,
+    });
+
+  it('declares reusable platform and accessibility submit capabilities', () => {
+    const declared = MACOS_DESKTOP_CAPABILITIES.map(
+      ({ id, capabilities }) => ({ id, capabilities })
+    );
+
+    expect(declared).toEqual([
+      {
+        id: 'builtin:platform.macos',
+        capabilities: ['control'],
+      },
+      {
+        id: 'builtin:desktop.macos.accessibilitySubmit',
+        capabilities: ['control', 'input'],
+      },
+    ]);
+  });
+
+  it('builds a submit script that uses keyboard-only input (no coordinate clicks)', () => {
+    const script = buildScript(false);
+
+    for (const expectedFragment of [
+      'tell application "Example" to activate',
+      'keystroke "v" using command down',
+      'key code 36',
+    ]) {
+      expect(script).toContain(expectedFragment);
+    }
+    // The script must not fall back to coordinate-based clicks; it relies on
+    // keyboard input only.
+    expect(script).not.toContain('click at {');
+  });
+
+  it('emits Cmd+N when createNewConversation is enabled', () => {
+    expect(buildScript(true)).toContain('keystroke "n" using command down');
+  });
+
+  it('verifies the target app is foregrounded before sending keystrokes and errors fast otherwise', () => {
+    const script = buildScript(false);
+
+    // Retry loop: polls `frontmost` and re-asserts `set frontmost to true` up
+    // to 10 times so a transient focus refusal can be retried.
+    expect(script).toContain('repeat 10 times');
+    expect(script).toContain('if frontmost then');
+    expect(script).toContain('set frontmost to true');
+
+    // If activation never succeeds the script must raise an error naming the
+    // foreground problem, instead of sending keystrokes to whichever app has
+    // focus and only failing later as an eval timeout.
+    expect(script).toContain('if not activated then');
+    expect(script).toContain(
+      'could not be brought to the foreground (focus is held by another app)'
+    );
+  });
+});
diff --git a/src/evals/externalHost/builtins/macosDesktop.ts b/src/evals/externalHost/builtins/macosDesktop.ts
new file mode 100644
index 0000000..e679fb9
--- /dev/null
+++ b/src/evals/externalHost/builtins/macosDesktop.ts
@@ -0,0 +1,358 @@
+import { execFile } from 'node:child_process';
+import { promisify } from 'node:util';
+import type {
+  ExternalHostCapabilityContext,
+  ExternalHostCapabilityImplementation,
+  ExternalHostFailureKind,
+  ExternalHostRunResult,
+} from '../types.js';
+import { driverToSlug, hostTypeFromDriver } from '../driverIdentity.js';
+
+// Promise-returning wrapper around `execFile`; `runAppleScript` uses it to
+// invoke `osascript`.
+const execFileAsync = promisify(execFile);
+// Settle delay (ms) used when a prompt submission does not specify
+// `settleDelayMs`.
+const DEFAULT_SETTLE_DELAY_MS = 500;
+// `osascript` timeout (ms) used when `runAppleScript` is given no `timeoutMs`.
+const DEFAULT_APPLESCRIPT_TIMEOUT_MS = 30_000;
+// Output buffer limit (64 MiB) used when `runAppleScript` is given no
+// `maxBuffer`.
+const DEFAULT_APPLESCRIPT_MAX_BUFFER = 64 * 1024 * 1024;
+
+/**
+ * Built-in capability implementations for macOS desktop hosts.
+ *
+ * - `builtin:platform.macos` runs `requireMacosCapability`, which returns an
+ *   `unsupported_host` failure when the process is not running on macOS.
+ * - `builtin:desktop.macos.accessibilitySubmit` runs `submitPromptCapability`,
+ *   which submits the prompt to the desktop app via clipboard paste and
+ *   keystrokes.
+ */
+export const MACOS_DESKTOP_CAPABILITIES: ExternalHostCapabilityImplementation[] =
+  [
+    {
+      id: 'builtin:platform.macos',
+      capabilities: ['control'],
+      run: requireMacosCapability,
+    },
+    {
+      id: 'builtin:desktop.macos.accessibilitySubmit',
+      capabilities: ['control', 'input'],
+      run: submitPromptCapability,
+    },
+  ];
+
+/**
+ * Runs an AppleScript source string with `osascript -e` and resolves to its
+ * standard output.
+ *
+ * @param script AppleScript source to execute.
+ * @param options Optional `timeoutMs` and `maxBuffer` overrides; the module
+ * defaults apply when they are omitted. On timeout the process is sent
+ * `SIGKILL`.
+ */
+export async function runAppleScript(
+  script: string,
+  options: { timeoutMs?: number; maxBuffer?: number } = {}
+): Promise<string> {
+  const timeout = options.timeoutMs ?? DEFAULT_APPLESCRIPT_TIMEOUT_MS;
+  const maxBuffer = options.maxBuffer ?? DEFAULT_APPLESCRIPT_MAX_BUFFER;
+
+  const { stdout } = await execFileAsync('osascript', ['-e', script], {
+    maxBuffer,
+    timeout,
+    killSignal: 'SIGKILL',
+  });
+  return stdout;
+}
+
+/**
+ * Writes `value` to the macOS clipboard by piping it into `pbcopy`.
+ *
+ * @param value Text to place on the clipboard.
+ * @returns A promise that resolves once `pbcopy` has exited successfully and
+ * rejects with an `Error` when `pbcopy` fails or its stdin cannot be written.
+ */
+export function writeMacosClipboard(value: string): Promise<void> {
+  return new Promise((resolve, reject) => {
+    const child = execFile('pbcopy', (error) => {
+      if (error) {
+        reject(new Error(error.message));
+        return;
+      }
+      resolve();
+    });
+    // A stream 'error' with no listener is thrown as an uncaught exception.
+    // stdin can emit one (e.g. EPIPE) when pbcopy fails to start or exits
+    // before reading its input, so route it into the promise instead. A
+    // promise settles only once, so a later callback outcome is ignored.
+    child.stdin?.on('error', (stdinError: Error) => {
+      reject(new Error(stdinError.message));
+    });
+    child.stdin?.end(value);
+  });
+}
+
+/**
+ * Reads the visible text of an app's front window through System Events.
+ *
+ * The generated AppleScript walks the UI element tree of the front window of
+ * the process named `appName` and collects the `value` of every element whose
+ * role is `AXStaticText` or `AXTextArea` (skipping missing values). Errors on
+ * individual elements are swallowed by the script's `try` blocks.
+ *
+ * @param appName Process name; embedded in the script via `JSON.stringify`.
+ * @returns The collected values joined with linefeeds, as printed by
+ * `osascript`.
+ */
+export async function readMacosAccessibilityText(
+  appName: string
+): Promise<string> {
+  const script = `
+on collectText(theElement)
+  set output to {}
+  try
+    tell application "System Events" to set elementRole to role of theElement
+    tell application "System Events" to set elementValue to value of theElement
+    if (elementRole is "AXStaticText" or elementRole is "AXTextArea") and elementValue is not missing value then set end of output to (elementValue as text)
+  end try
+  try
+    tell application "System Events" to set uiChildren to UI elements of theElement
+    repeat with childElement in uiChildren
+      set output to output & my collectText(childElement)
+    end repeat
+  end try
+  return output
+end collectText
+
+tell application "System Events" to tell process ${JSON.stringify(appName)}
+  set textItems to my collectText(front window)
+end tell
+set AppleScript's text item delimiters to linefeed
+return textItems as text
+`;
+  return runAppleScript(script);
+}
+
+/**
+ * Asks System Events for the `entire contents` of the front window of the
+ * process named `appName` and resolves to what `osascript` prints.
+ */
+export async function readMacosFrontWindowContents(
+  appName: string
+): Promise<string> {
+  const quotedAppName = JSON.stringify(appName);
+  return runAppleScript(
+    `tell application "System Events" to tell process ${quotedAppName} to get entire contents of front window`
+  );
+}
+
+/**
+ * Capability step that only lets the run continue on macOS.
+ *
+ * On `darwin` it resolves to nothing. On any other platform it returns an
+ * `unsupported_host` failure whose error and limitation text can be
+ * overridden through the binding's `error` and `limitation` options.
+ */
+async function requireMacosCapability(
+  capabilityContext: ExternalHostCapabilityContext
+): Promise<ExternalHostRunResult | void> {
+  if (process.platform !== 'darwin') {
+    const { config, run, binding, state } = capabilityContext;
+    const error =
+      stringOption(binding.with, 'error') ??
+      `${state.displayName} currently requires macOS automation support.`;
+    const limitation =
+      stringOption(binding.with, 'limitation') ??
+      'Windows UI Automation support has not been added yet.';
+
+    return desktopFailureResult({
+      config,
+      context: run,
+      state,
+      failureKind: 'unsupported_host',
+      error,
+      limitations: [limitation],
+    });
+  }
+}
+
+/**
+ * Capability step that submits the run's scenario text to the desktop app.
+ *
+ * The app name comes from the binding or config `appName` option and falls
+ * back to the host's display name. Resolves to nothing on success; any error
+ * is converted into a failure result whose kind is derived from the error
+ * message.
+ */
+async function submitPromptCapability(
+  capabilityContext: ExternalHostCapabilityContext
+): Promise<ExternalHostRunResult | void> {
+  const { config, run, binding, state } = capabilityContext;
+
+  try {
+    const appName =
+      runStringOption(config, binding, 'appName') ?? state.displayName;
+    const createNewConversation = shouldCreateNewConversation(
+      binding.with?.createNewConversation,
+      config
+    );
+    const settleDelayMs = runNumberOption(config, binding, 'settleDelayMs');
+    const submitButtonNames = stringArrayOption(
+      binding.with,
+      'submitButtonNames'
+    );
+
+    await submitPromptToMacosDesktopApp(run.submittedScenario, {
+      appName,
+      createNewConversation,
+      settleDelayMs,
+      submitButtonNames,
+    });
+  } catch (submissionError) {
+    const message = formatError(submissionError);
+    return desktopFailureResult({
+      config,
+      context: run,
+      state,
+      failureKind: classifyDesktopSubmissionFailure(message),
+      error: `Failed to submit prompt to desktop host: ${message}`,
+      limitations: [
+        'The desktop host app must be installed, signed in, and allowed in macOS Automation/Accessibility settings.',
+      ],
+    });
+  }
+}
+
+/**
+ * Submits a prompt to a macOS desktop app: the prompt is written to the
+ * clipboard, then the generated AppleScript is run to paste and send it.
+ *
+ * @param prompt Text to submit.
+ * @param options Target app and submission settings; `settleDelayMs`
+ * defaults to `DEFAULT_SETTLE_DELAY_MS`.
+ */
+export async function submitPromptToMacosDesktopApp(
+  prompt: string,
+  options: {
+    appName: string;
+    createNewConversation: boolean;
+    settleDelayMs?: number;
+    submitButtonNames?: string[];
+  }
+): Promise<void> {
+  const script = buildMacosDesktopSubmitScript(prompt, {
+    ...options,
+    settleDelayMs: options.settleDelayMs ?? DEFAULT_SETTLE_DELAY_MS,
+  });
+
+  // The clipboard must hold the prompt before the script presses Cmd+V.
+  await writeMacosClipboard(prompt);
+  await runAppleScript(script);
+}
+
+/**
+ * Builds the AppleScript that pastes and submits a prompt in a desktop app.
+ *
+ * The script activates the app, retries up to 10 times (0.2 s apart) to make
+ * its process frontmost, and raises an error if that never succeeds. It then
+ * optionally presses Cmd+N, pastes with Cmd+V, and presses Return
+ * (key code 36).
+ *
+ * @param _prompt Not embedded in the script; the prompt is expected to be on
+ * the clipboard already when the script runs.
+ * @param options.appName App/process name, quoted with `JSON.stringify`.
+ * @param options.createNewConversation When true, Cmd+N is sent first,
+ * followed by a delay of at least 1.5 s (or `settleDelayMs` if larger).
+ * @param options.settleDelayMs Delay after activation, in milliseconds.
+ * @param options.submitButtonNames Accepted but not referenced by the
+ * generated script.
+ * @returns The AppleScript source.
+ */
+export function buildMacosDesktopSubmitScript(
+  _prompt: string,
+  options: {
+    appName: string;
+    createNewConversation: boolean;
+    settleDelayMs: number;
+    submitButtonNames?: string[];
+  }
+): string {
+  const settleDelayMs = options.settleDelayMs;
+
+  // AppleScript `delay` takes seconds, hence the division by 1000.
+  const newConversation = options.createNewConversation
+    ? `keystroke "n" using command down
+  delay ${Math.max(settleDelayMs, 1500) / 1000}`
+    : '';
+
+  return `
+tell application ${JSON.stringify(options.appName)} to activate
+delay ${settleDelayMs / 1000}
+
+-- Verify the app actually came to the foreground. tell-to-activate is
+-- unreliable on multi-monitor / multi-Space setups when another app
+-- (browser, terminal, etc.) holds focus-prevention precedence. Retry
+-- bringing the app forward up to ~2 seconds; fail fast with a clear
+-- error if the OS refuses, since otherwise our keystrokes route to
+-- whatever app actually has focus and the eval times out 90s later.
+set activated to false
+repeat 10 times
+  tell application "System Events" to tell process ${JSON.stringify(options.appName)}
+    if frontmost then
+      set activated to true
+      exit repeat
+    end if
+    try
+      set frontmost to true
+    end try
+  end tell
+  delay 0.2
+end repeat
+if not activated then
+  error ${JSON.stringify(options.appName)} & " could not be brought to the foreground (focus is held by another app); keystrokes would route to the wrong app"
+end if
+
+tell application "System Events"
+  -- Force a known-focus state by opening a new conversation. Chromium's React
+  -- app autofocuses the composer on a fresh chat view, even though
+  -- AXFocusedUIElement doesn't expose that state to AppleScript. This avoids
+  -- coordinate-based clicks that are fragile to window position, monitor
+  -- placement, or layout drift.
+  ${newConversation}
+  -- Paste the prompt from clipboard. The caller has already written the
+  -- prompt to the macOS clipboard via writeMacosClipboard. The keystroke
+  -- routes to whatever has DOM focus inside the active window.
+  keystroke "v" using command down
+  delay 0.4
+  -- Submit via Return.
+  key code 36
+end tell
+`;
+}
+
+/**
+ * Decides whether a new conversation should be opened before submitting.
+ *
+ * The special value `'unless-disabled'` defers to the config: a new
+ * conversation is created unless the `newConversationShortcut` option is
+ * `'none'`. Any other value enables it only when it is exactly `true`.
+ */
+function shouldCreateNewConversation(
+  option: unknown,
+  config: { options?: Record<string, unknown> }
+): boolean {
+  if (option !== 'unless-disabled') {
+    return option === true;
+  }
+  const shortcut = configStringOption(config, 'newConversationShortcut');
+  return shortcut !== 'none';
+}
+
+/**
+ * Builds an unsuccessful run result for a desktop capability step.
+ *
+ * Host identity comes from the capability state (driver and display name);
+ * the host type is taken from the config when set, otherwise derived from
+ * the driver. The result has no tool calls, trace source, or artifacts.
+ */
+function desktopFailureResult(details: {
+  config: ExternalHostCapabilityContext['config'];
+  context: ExternalHostCapabilityContext['run'];
+  state: ExternalHostCapabilityContext['state'];
+  failureKind: ExternalHostFailureKind;
+  error: string;
+  limitations: string[];
+}): ExternalHostRunResult {
+  const { config, context, state } = details;
+  const { driver, displayName } = state;
+
+  return {
+    success: false,
+    toolCalls: [],
+    error: details.error,
+    externalHost: {
+      driver,
+      driverSlug: driverToSlug(driver),
+      displayName,
+      hostName: displayName,
+      hostType: config.hostType ?? hostTypeFromDriver(driver),
+      hostVariant: config.variant,
+      capabilitiesUsed: state.capabilitiesUsed,
+      traceSource: 'none',
+      traceConfidence: 'unknown',
+      traceLimitations: details.limitations,
+      artifacts: [],
+      session: { runMarker: context.marker },
+      correlation: context.correlation,
+      failureKind: details.failureKind,
+    },
+  };
+}
+
+/**
+ * Resolves a string option for a capability run: the binding's `with` value
+ * takes precedence, then the host config's option of the same key.
+ */
+function runStringOption(
+  config: { options?: Record<string, unknown> },
+  binding: { with?: Record<string, unknown> },
+  key: string
+): string | undefined {
+  const fromBinding = stringOption(binding.with, key);
+  if (fromBinding !== undefined) {
+    return fromBinding;
+  }
+  return configStringOption(config, key);
+}
+
+/**
+ * Resolves a numeric option for a capability run: the binding's `with` value
+ * is used when it is a number, otherwise the host config's option of the
+ * same key.
+ */
+function runNumberOption(
+  config: { options?: Record<string, unknown> },
+  binding: { with?: Record<string, unknown> },
+  key: string
+): number | undefined {
+  const bindingValue = binding.with?.[key];
+  if (typeof bindingValue === 'number') {
+    return bindingValue;
+  }
+  return configNumberOption(config, key);
+}
+
+/** Reads a string-valued entry from the host config's `options`. */
+function configStringOption(
+  config: { options?: Record<string, unknown> },
+  key: string
+): string | undefined {
+  const { options } = config;
+  return stringOption(options, key);
+}
+
+/**
+ * Reads a number-valued entry from the host config's `options`; values of
+ * any other type yield `undefined`.
+ */
+function configNumberOption(
+  config: { options?: Record<string, unknown> },
+  key: string
+): number | undefined {
+  const candidate = config.options?.[key];
+  if (typeof candidate !== 'number') {
+    return undefined;
+  }
+  return candidate;
+}
+
+/**
+ * Reads `key` from an optional options record and returns it only when the
+ * stored value is a string.
+ */
+function stringOption(
+  options: Record<string, unknown> | undefined,
+  key: string
+): string | undefined {
+  const candidate = options?.[key];
+  if (typeof candidate !== 'string') {
+    return undefined;
+  }
+  return candidate;
+}
+
+/**
+ * Reads `key` from an optional options record as a list of strings.
+ *
+ * Non-array values yield `undefined`. Non-string items are dropped, and an
+ * array left with no strings also yields `undefined`.
+ */
+function stringArrayOption(
+  options: Record<string, unknown> | undefined,
+  key: string
+): string[] | undefined {
+  const raw = options?.[key];
+  if (!Array.isArray(raw)) {
+    return undefined;
+  }
+
+  const strings: string[] = [];
+  for (const item of raw) {
+    if (typeof item === 'string') {
+      strings.push(item);
+    }
+  }
+  return strings.length === 0 ? undefined : strings;
+}
+
+/**
+ * Maps a submission error message to a failure kind by case-insensitive
+ * substring matching.
+ *
+ * Permission-related phrases are checked first and yield
+ * `automation_permission_denied`; "can't get application" / "application
+ * isn't running" (with either apostrophe style) yield `app_unavailable`;
+ * everything else is `submission_failed`.
+ */
+function classifyDesktopSubmissionFailure(
+  message: string
+): ExternalHostFailureKind {
+  const normalized = message.toLowerCase();
+  const mentionsAny = (phrases: readonly string[]): boolean =>
+    phrases.some((phrase) => normalized.includes(phrase));
+
+  if (
+    mentionsAny([
+      'not authorized',
+      'not permitted',
+      'assistive access',
+      'accessibility',
+      'automation',
+    ])
+  ) {
+    return 'automation_permission_denied';
+  }
+
+  if (
+    mentionsAny([
+      'can’t get application',
+      "can't get application",
+      'application isn’t running',
+      "application isn't running",
+    ])
+  ) {
+    return 'app_unavailable';
+  }
+
+  return 'submission_failed';
+}
+
+/**
+ * Converts a caught value to text: an `Error` contributes its `message`,
+ * anything else is passed through `String()`.
+ */
+function formatError(err: unknown): string {
+  const isError = err instanceof Error;
+  return isError ? err.message : String(err);
+}
diff --git a/src/evals/externalHost/capabilities.test.ts b/src/evals/externalHost/capabilities.test.ts
new file mode 100644
index 0000000..a5692bd
--- /dev/null
+++ b/src/evals/externalHost/capabilities.test.ts
@@ -0,0 +1,24 @@
+import { describe, expect, it } from 'vitest';
+import { validateHostCapabilities } from './capabilities.js';
+
+describe('validateHostCapabilities', () => {
+  it('passes when all required external host capabilities are present', () => {
+    expect(
+      validateHostCapabilities([
+        'control',
+        'input',
+        'completion',
+        'trace',
+        'normalize',
+      ])
+    ).toEqual([]); // an empty array means nothing is missing
+  });
+  // Missing entries come back in REQUIRED_HOST_CAPABILITIES order.
+  it('reports missing required capabilities', () => {
+    expect(validateHostCapabilities(['control', 'input'])).toEqual([
+      'completion',
+      'trace',
+      'normalize',
+    ]);
+  });
+});
diff --git a/src/evals/externalHost/capabilities.ts b/src/evals/externalHost/capabilities.ts
new file mode 100644
index 0000000..6877e36
--- /dev/null
+++ b/src/evals/externalHost/capabilities.ts
@@ -0,0 +1,18 @@
+import type { HostCapability } from './types.js';
+
+export const REQUIRED_HOST_CAPABILITIES: HostCapability[] = [
+  'control',
+  'input',
+  'completion',
+  'trace',
+  'normalize',
+]; // array order is the order in which missing capabilities are reported
+
+export function validateHostCapabilities(
+  capabilities: readonly HostCapability[]
+): HostCapability[] {
+  // Anything required that the caller did not supply is reported as missing.
+  const supplied = new Set<HostCapability>(capabilities);
+  const isMissing = (needed: HostCapability): boolean => !supplied.has(needed);
+  return REQUIRED_HOST_CAPABILITIES.filter(isMissing); // keeps required order
+}
diff --git a/src/evals/externalHost/capabilityRuntime.test.ts b/src/evals/externalHost/capabilityRuntime.test.ts
new file mode 100644
index 0000000..76a639b
--- /dev/null
+++ b/src/evals/externalHost/capabilityRuntime.test.ts
@@ -0,0 +1,155 @@
+import { describe, expect, it } from 'vitest';
+import {
+  loadExternalHostConfig,
+  loadExternalHostRunner,
+  registerExternalHostCapability,
+} from './capabilityRuntime.js';
+
+const TEST_DRIVER = {
+  provider: 'test',
+  product: 'host',
+  surface: 'chat',
+  runtime: 'desktop-app',
+  platform: 'macos',
+} as const; // slug form: test.host.chat.desktop-app.macos
+
+const TEST_CORRELATION = {
+  strategy: 'prompt_marker',
+  marker: 'MCP_SERVER_TESTER_CAPABILITY',
+  includedInPrompt: true,
+} as const; // same marker the run context below is given
+
+describe('external host capability runtime', () => {
+  it('composes a runner from config-declared capability bindings', async () => {
+    const calls: string[] = [];
+
+    registerExternalHostCapability({
+      id: 'test.capability.success',
+      capabilities: ['control', 'input', 'completion', 'trace', 'normalize'],
+      async setup({ state }) {
+        calls.push('setup');
+        state.data.setupSeen = true;
+      },
+      async run({ run, state }) {
+        calls.push('run');
+        expect(state.driverSlug).toBe('test.host.chat.desktop-app.macos');
+        expect(state.data.setupSeen).toBe(true);
+        return {
+          success: true,
+          response: 'composed result',
+          toolCalls: [],
+          externalHost: {
+            driver: state.driver,
+            driverSlug: state.driverSlug,
+            displayName: state.displayName,
+            hostName: state.displayName,
+            hostType: 'custom',
+            capabilitiesUsed: state.capabilitiesUsed,
+            traceSource: 'manual-import',
+            traceConfidence: 'high',
+            artifacts: [],
+            session: { runMarker: run.marker },
+            correlation: run.correlation,
+          },
+        };
+      },
+    });
+    // Only `control` is bound; the implementation itself declares all five capabilities.
+    const runner = await loadExternalHostRunner({
+      driver: TEST_DRIVER,
+      capabilities: {
+        control: {
+          uses: 'test.capability.success',
+          provides: ['input', 'completion', 'trace', 'normalize'],
+        },
+      },
+    });
+
+    const result = await runner.run({
+      runId: 'run',
+      caseId: 'case',
+      scenario: 'scenario',
+      submittedScenario: 'scenario',
+      marker: 'MCP_SERVER_TESTER_CAPABILITY',
+      correlation: TEST_CORRELATION,
+      timeoutMs: 1000,
+      startedAtMs: Date.now(),
+    });
+
+    expect(calls).toEqual(['setup', 'run']); // the setup hook precedes the run hook
+    expect(result).toMatchObject({
+      success: true,
+      response: 'composed result',
+      externalHost: {
+        driverSlug: 'test.host.chat.desktop-app.macos',
+        capabilitiesUsed: [
+          'control',
+          'input',
+          'completion',
+          'trace',
+          'normalize',
+        ],
+      },
+    });
+  });
+
+  it('treats binding provides as additional capabilities', async () => {
+    registerExternalHostCapability({
+      id: 'test.capability.extraControl',
+      capabilities: ['control'],
+    });
+    registerExternalHostCapability({
+      id: 'test.capability.inputTrace',
+      capabilities: ['input', 'trace'],
+    });
+
+    const loaded = await loadExternalHostConfig({
+      driver: TEST_DRIVER,
+      capabilities: {
+        control: { uses: 'test.capability.extraControl' },
+        input: {
+          uses: 'test.capability.inputTrace',
+          provides: ['completion', 'normalize'],
+        },
+      },
+    });
+    // Order: binding key, then implementation capabilities, then binding provides.
+    expect(loaded.capabilitiesUsed).toEqual([
+      'control',
+      'input',
+      'trace',
+      'completion',
+      'normalize',
+    ]);
+  });
+
+  it('fails config loading when required capabilities are missing', async () => {
+    registerExternalHostCapability({
+      id: 'test.capability.controlOnly',
+      capabilities: ['control'],
+    });
+
+    await expect(
+      loadExternalHostConfig({
+        driver: TEST_DRIVER,
+        capabilities: {
+          control: { uses: 'test.capability.controlOnly' },
+        },
+      })
+    ).rejects.toThrow('missing capabilities');
+  });
+
+  it('fails config loading for unavailable capability implementations', async () => {
+    await expect(
+      loadExternalHostConfig({
+        driver: TEST_DRIVER,
+        capabilities: {
+          control: {
+            uses: 'missing.capability', // expected to resolve to no implementation
+            provides: ['input', 'completion', 'trace', 'normalize'],
+          },
+        },
+      })
+    ).rejects.toThrow('not available');
+  });
+});
diff --git a/src/evals/externalHost/capabilityRuntime.ts b/src/evals/externalHost/capabilityRuntime.ts
new file mode 100644
index 0000000..152044f
--- /dev/null
+++ b/src/evals/externalHost/capabilityRuntime.ts
@@ -0,0 +1,336 @@
+import {
+  REQUIRED_HOST_CAPABILITIES,
+  validateHostCapabilities,
+} from './capabilities.js';
+import {
+  getRegisteredExternalHostConfig,
+  getRegisteredExternalHostDisplayName,
+} from './hostRegistry.js';
+import {
+  listBuiltinExternalHostCapabilities,
+  resolveBuiltinExternalHostCapability,
+} from './builtinCapabilities.js';
+import {
+  driverToSlug,
+  hostTypeFromDriver,
+  normalizeHostDriver,
+} from './driverIdentity.js';
+import type {
+  ExternalHostCapabilityBinding,
+  ExternalHostCapabilityContext,
+  ExternalHostCapabilityImplementation,
+  ExternalHostCapabilitiesConfig,
+  ExternalHostConfig,
+  ExternalHostRunResult,
+  ExternalHostRunState,
+  ExternalHostRunner,
+  HostCapability,
+  HostDriverId,
+  HostRunContext,
+} from './types.js';
+
+const CAPABILITIES = new Map<string, ExternalHostCapabilityImplementation>(); // runtime registry keyed by implementation id
+
+export interface LoadedExternalHostCapability {
+  capability: HostCapability; // config key the binding was declared under
+  binding: ExternalHostCapabilityBinding; // the binding as it appears in the merged config
+  implementation: ExternalHostCapabilityImplementation; // resolved from binding.uses
+}
+
+export interface LoadedExternalHostConfig {
+  config: ExternalHostConfig; // caller config merged over registry defaults
+  driver: HostDriverId; // structured form of config.driver
+  driverSlug: string; // dotted slug derived from driver
+  displayName: string; // config name, else registry name, else the slug
+  loadedCapabilities: LoadedExternalHostCapability[]; // one entry per resolved binding
+  capabilitiesUsed: HostCapability[]; // union of everything the bindings provide
+}
+
+export function registerExternalHostCapability(
+  implementation: ExternalHostCapabilityImplementation
+): void {
+  CAPABILITIES.set(implementation.id, implementation); // same id replaces the earlier entry
+}
+
+export function listExternalHostCapabilities(): ExternalHostCapabilityImplementation[] {
+  // Builtins are inserted first; a registered implementation sharing an id then
+  // replaces the builtin entry while the id keeps its original position.
+  const byId = new Map<string, ExternalHostCapabilityImplementation>();
+  for (const impl of listBuiltinExternalHostCapabilities())
+    byId.set(impl.id, impl);
+  for (const impl of CAPABILITIES.values()) byId.set(impl.id, impl);
+  return Array.from(byId.values());
+}
+
+export async function resolveExternalHostCapability(
+  uses: string
+): Promise<ExternalHostCapabilityImplementation | undefined> {
+  // Lookup precedence: runtime registrations first, then builtins, and only
+  // then ids carrying the module: prefix, which are loaded by dynamic import.
+  const known =
+    CAPABILITIES.get(uses) || resolveBuiltinExternalHostCapability(uses);
+  if (known) {
+    return known;
+  }
+
+  // Ids of any other shape are reported as unresolved rather than throwing.
+  if (!uses.startsWith('module:')) {
+    return undefined;
+  }
+
+  // May reject: module loading throws on malformed ids or invalid exports.
+  return loadModuleCapability(uses);
+}
+
+export async function loadExternalHostRunner(
+  config: ExternalHostConfig
+): Promise<ExternalHostRunner> {
+  // Resolve and validate every binding up front; rejects when loading fails.
+  const resolved = await loadExternalHostConfig(config);
+  return createExternalHostRunner(resolved);
+}
+
+export function createExternalHostRunner(
+  loaded: LoadedExternalHostConfig
+): ExternalHostRunner {
+  // The runner closes over the already-loaded config, so every run reuses it.
+  const run = (context: HostRunContext): Promise<ExternalHostRunResult> =>
+    runLoadedExternalHost(loaded, context);
+
+  return { run };
+}
+
+export async function loadExternalHostConfig(
+  config: ExternalHostConfig
+): Promise<LoadedExternalHostConfig> {
+  const driver = normalizeHostDriver(config.driver);
+  const driverSlug = driverToSlug(driver);
+  const registeredConfig = getRegisteredExternalHostConfig(driverSlug);
+  const effectiveConfig = mergeExternalHostConfig(config, registeredConfig);
+  const capabilitiesConfig = effectiveConfig.capabilities;
+  // Without caller-declared or registered capability bindings there is nothing to load.
+  if (!capabilitiesConfig) {
+    throw new Error(
+      `External host ${driverSlug} does not declare capabilities and has no built-in defaults.`
+    );
+  }
+
+  const loadedCapabilities: LoadedExternalHostCapability[] = [];
+  const providedCapabilities = new Set<HostCapability>();
+  // Bindings load in REQUIRED_HOST_CAPABILITIES order; set insertion order becomes capabilitiesUsed.
+  for (const capability of REQUIRED_HOST_CAPABILITIES) {
+    const bindings = normalizeCapabilityBindings(
+      capabilitiesConfig[capability]
+    );
+    for (const binding of bindings) {
+      const implementation = await resolveExternalHostCapability(binding.uses);
+      if (!implementation) {
+        throw new Error(
+          `External host capability implementation is not available: ${binding.uses}`
+        );
+      }
+      // The binding's own key counts as provided, plus whatever the implementation and binding declare.
+      loadedCapabilities.push({
+        capability,
+        binding,
+        implementation,
+      });
+      providedCapabilities.add(capability);
+      for (const provided of [
+        ...implementation.capabilities,
+        ...(binding.provides ?? []),
+      ]) {
+        providedCapabilities.add(provided);
+      }
+    }
+  }
+  // Fail fast when the union of provided capabilities leaves a required one uncovered.
+  const capabilitiesUsed = Array.from(providedCapabilities);
+  const missingCapabilities = validateHostCapabilities(capabilitiesUsed);
+  if (missingCapabilities.length > 0) {
+    throw new Error(
+      `External host ${driverSlug} is missing capabilities: ${missingCapabilities.join(', ')}`
+    );
+  }
+
+  return {
+    config: effectiveConfig,
+    driver,
+    driverSlug,
+    displayName:
+      effectiveConfig.name ??
+      getRegisteredExternalHostDisplayName(driverSlug) ??
+      driverSlug,
+    loadedCapabilities,
+    capabilitiesUsed,
+  };
+}
+
+async function runLoadedExternalHost(
+  loaded: LoadedExternalHostConfig,
+  context: HostRunContext
+): Promise<ExternalHostRunResult> {
+  const state: ExternalHostRunState = {
+    driver: loaded.driver,
+    driverSlug: loaded.driverSlug,
+    displayName: loaded.displayName,
+    capabilitiesUsed: loaded.capabilitiesUsed,
+    data: {},
+  };
+  // Setup phase: each binding's optional setup hook runs in load order and may end the run early.
+  for (const loadedCapability of loaded.loadedCapabilities) {
+    const result = await loadedCapability.implementation.setup?.(
+      capabilityContext(loaded, context, state, loadedCapability)
+    );
+    if (result) {
+      return result;
+    }
+    if (state.result) {
+      return state.result;
+    }
+  }
+  // Run phase: the first hook to return a result, or to set state.result, decides the outcome.
+  for (const loadedCapability of loaded.loadedCapabilities) {
+    const result = await loadedCapability.implementation.run?.(
+      capabilityContext(loaded, context, state, loadedCapability)
+    );
+    if (result) {
+      return result;
+    }
+    if (state.result) {
+      return state.result;
+    }
+  }
+  // No hook produced a result, so a failure result is returned instead of nothing.
+  return runtimeFailure(
+    loaded,
+    context,
+    `External host ${loaded.driverSlug} completed without producing a result.`
+  );
+}
+
+function capabilityContext(
+  loaded: LoadedExternalHostConfig,
+  run: HostRunContext,
+  state: ExternalHostRunState,
+  loadedCapability: LoadedExternalHostCapability
+): ExternalHostCapabilityContext {
+  // Assembles the context one setup/run hook receives for a single binding.
+  // The state object is passed through by reference, not copied, so whatever
+  // a hook writes to it stays on the object the caller supplied.
+  const { config } = loaded;
+  const { capability, binding } = loadedCapability;
+
+  return { config, run, capability, binding, state };
+}
+
+function mergeExternalHostConfig(
+  config: ExternalHostConfig,
+  builtin: Partial<ExternalHostConfig> | undefined
+): ExternalHostConfig {
+  if (!builtin) {
+    return config;
+  }
+  // Caller fields win over registry defaults; nested bags are merged one level deep.
+  return {
+    ...builtin,
+    ...config,
+    capabilities: mergeCapabilities(builtin.capabilities, config.capabilities),
+    correlation: {
+      ...builtin.correlation,
+      ...config.correlation,
+    }, // always an object here, even when neither side sets correlation
+    options: {
+      ...builtin.options,
+      ...config.options,
+    },
+  };
+}
+
+function mergeCapabilities(
+  base: ExternalHostCapabilitiesConfig | undefined,
+  override: ExternalHostCapabilitiesConfig | undefined
+): ExternalHostCapabilitiesConfig | undefined {
+  // Shallow, per-capability merge: an override's binding (or binding list)
+  // replaces the base binding for that capability wholesale.
+  if (base && override) {
+    return {
+      ...base,
+      ...override,
+    };
+  }
+  // At most one side is present here; hand back whichever one exists.
+  return base ? base : override;
+}
+
+function normalizeCapabilityBindings(
+  binding:
+    | ExternalHostCapabilityBinding
+    | ExternalHostCapabilityBinding[]
+    | undefined
+): ExternalHostCapabilityBinding[] {
+  // Accepts the three config spellings: nothing, one binding, or a list.
+  // A list is returned as-is (same array instance), not copied.
+  if (Array.isArray(binding)) return binding;
+  return binding ? [binding] : [];
+}
+
+async function loadModuleCapability(
+  uses: string
+): Promise<ExternalHostCapabilityImplementation | undefined> {
+  // Id grammar: module:<specifier>[#<exportName>]; the export defaults to 'default'.
+  const remainder = uses.slice('module:'.length);
+  const [specifier, exportName = 'default'] = remainder.split('#');
+  if (!specifier) {
+    throw new Error(`Invalid external host module capability id: ${uses}`);
+  }
+  const loadedModule = (await import(specifier)) as Record<string, unknown>;
+  const candidate = loadedModule[exportName];
+  if (isExternalHostCapabilityImplementation(candidate)) {
+    return candidate;
+  }
+  throw new Error(
+    `External host module capability ${uses} did not export a valid implementation.`
+  );
+}
+
+function isExternalHostCapabilityImplementation(
+  value: unknown
+): value is ExternalHostCapabilityImplementation {
+  // Structural check only: element types and setup/run hooks are not inspected.
+  if (typeof value !== 'object' || value === null) return false;
+  const candidate = value as Partial<ExternalHostCapabilityImplementation>;
+  return (
+    typeof candidate.id === 'string' && Array.isArray(candidate.capabilities)
+  );
+}
+
+function runtimeFailure(
+  loaded: LoadedExternalHostConfig,
+  context: HostRunContext,
+  error: string
+): ExternalHostRunResult {
+  return {
+    success: false,
+    toolCalls: [], // no tool evidence is attached to this failure result
+    error,
+    externalHost: {
+      driver: loaded.driver,
+      driverSlug: loaded.driverSlug,
+      displayName: loaded.displayName,
+      hostName: loaded.displayName, // mirrors displayName
+      hostType: loaded.config.hostType ?? hostTypeFromDriver(loaded.driver), // explicit config value wins over inference
+      hostVariant: loaded.config.variant,
+      capabilitiesUsed: loaded.capabilitiesUsed,
+      traceSource: 'none',
+      traceConfidence: 'unknown',
+      traceLimitations: [
+        'The external host capability runner did not produce a result.',
+      ],
+      artifacts: [],
+      session: { runMarker: context.marker },
+      correlation: context.correlation,
+      failureKind: 'unsupported_host', // same kind that config-loading failures report
+    },
+  };
+}
diff --git a/src/evals/externalHost/driverIdentity.ts b/src/evals/externalHost/driverIdentity.ts
new file mode 100644
index 0000000..4ca7d6f
--- /dev/null
+++ b/src/evals/externalHost/driverIdentity.ts
@@ -0,0 +1,77 @@
+import type {
+  ExternalHostType,
+  HostDriverConfig,
+  HostDriverId,
+} from './types.js';
+
+export const CLAUDE_CHAT_DESKTOP_MACOS_DRIVER: HostDriverId = {
+  provider: 'anthropic',
+  product: 'claude',
+  surface: 'chat',
+  runtime: 'desktop-app',
+  platform: 'macos',
+}; // slug: anthropic.claude.chat.desktop-app.macos
+
+export const CLAUDE_COWORK_DESKTOP_MACOS_DRIVER: HostDriverId = {
+  provider: 'anthropic',
+  product: 'claude',
+  surface: 'cowork',
+  runtime: 'desktop-app',
+  platform: 'macos',
+}; // slug: anthropic.claude.cowork.desktop-app.macos
+
+export const CLAUDE_CODE_CLI_MACOS_DRIVER: HostDriverId = {
+  provider: 'anthropic',
+  product: 'claude',
+  surface: 'code',
+  runtime: 'cli',
+  platform: 'macos',
+}; // slug: anthropic.claude.code.cli.macos
+
+export function driverToSlug(driver: HostDriverId): string {
+  // Segment order: provider.product.surface.runtime[.platform[.channel]].
+  // NOTE(review): empty or missing segments are skipped, so a channel given
+  // without a platform would be read back as the platform by parseDriverSlug.
+  const { provider, product, surface, runtime, platform, channel } = driver;
+  const segments = [provider, product, surface, runtime, platform, channel];
+  const present: string[] = [];
+  for (const segment of segments) {
+    if (segment) present.push(segment);
+  }
+  return present.join('.');
+}
+
+export function parseDriverSlug(slug: string): HostDriverId {
+  const [provider, product, surface, runtime, platform, ...rest] =
+    slug.split('.');
+  // The four leading segments are mandatory; an empty segment counts as missing.
+  if (!provider || !product || !surface || !runtime) {
+    throw new Error(
+      `External host driver slug must include provider.product.surface.runtime: ${slug}`
+    );
+  }
+  // platform is optional; anything past the fifth segment is re-joined into channel.
+  return {
+    provider,
+    product,
+    surface,
+    runtime,
+    ...(platform ? { platform } : {}),
+    ...(rest.length > 0 ? { channel: rest.join('.') } : {}),
+  };
+}
+
+export function normalizeHostDriver(driver: HostDriverConfig): HostDriverId {
+  // Structured ids pass through untouched: same object, no copy, no validation.
+  if (typeof driver !== 'string') {
+    return driver;
+  }
+  return parseDriverSlug(driver); // throws when a mandatory segment is missing
+}
+
+export function hostTypeFromDriver(driver: HostDriverId): ExternalHostType {
+  const { runtime } = driver; // unrecognised runtimes fall back to 'custom'
+  if (runtime === 'desktop-app') return 'desktop';
+  if (runtime === 'browser') return 'browser';
+  return runtime === 'cli' || runtime === 'tui' ? 'cli' : 'custom';
+}
diff --git a/src/evals/externalHost/hostRegistry.test.ts b/src/evals/externalHost/hostRegistry.test.ts
new file mode 100644
index 0000000..4550f9d
--- /dev/null
+++ b/src/evals/externalHost/hostRegistry.test.ts
@@ -0,0 +1,91 @@
+import { describe, expect, it } from 'vitest';
+import {
+  CLAUDE_COWORK_DESKTOP_MACOS_DRIVER,
+  driverToSlug,
+  getRegisteredExternalHostConfig,
+  loadExternalHostConfig,
+  listRegisteredExternalHostSlugs,
+  normalizeHostDriver,
+  parseDriverSlug,
+} from './index.js';
+
+describe('external host driver identity and built-in defaults', () => {
+  it('round-trips structured driver ids to slugs', () => {
+    const slug = driverToSlug(CLAUDE_COWORK_DESKTOP_MACOS_DRIVER);
+
+    expect(slug).toBe('anthropic.claude.cowork.desktop-app.macos');
+    expect(parseDriverSlug(slug)).toEqual(CLAUDE_COWORK_DESKTOP_MACOS_DRIVER);
+  });
+
+  it('normalizes driver slug strings to structured ids', () => {
+    expect(
+      normalizeHostDriver('anthropic.claude.cowork.desktop-app.macos')
+    ).toEqual(CLAUDE_COWORK_DESKTOP_MACOS_DRIVER);
+  });
+
+  it('declares Claude Cowork as capability bindings, not a concrete runner', () => {
+    const config = getRegisteredExternalHostConfig(
+      'anthropic.claude.cowork.desktop-app.macos'
+    );
+    // The registry entry is declarative: it names capability ids, not runner code.
+    expect(config?.name).toBe('Claude Cowork Desktop');
+    expect(config?.correlation).toEqual({
+      strategy: 'prompt_marker',
+      includeInPrompt: true,
+    });
+    expect(config?.capabilities).toMatchObject({
+      control: [
+        { uses: 'builtin:platform.macos' },
+        {
+          uses: 'builtin:anthropic.claude.activateCoworkSurface',
+          with: { appName: 'Claude' },
+        },
+      ],
+      input: { uses: 'builtin:desktop.macos.accessibilitySubmit' },
+      completion: {
+        uses: 'builtin:anthropic.claude.localAgentTrace',
+        provides: ['trace'],
+      },
+      normalize: {
+        uses: 'builtin:anthropic.claude.localAgentNormalize',
+      },
+    });
+  });
+
+  it('loads Claude Cowork defaults into concrete capability providers at runtime', async () => {
+    const loaded = await loadExternalHostConfig({
+      driver: 'anthropic.claude.cowork.desktop-app.macos',
+    });
+
+    expect(loaded.displayName).toBe('Claude Cowork Desktop');
+    expect(loaded.capabilitiesUsed).toEqual([
+      'control',
+      'input',
+      'completion',
+      'trace',
+      'normalize',
+    ]);
+    expect(
+      loaded.loadedCapabilities.map((capability) => capability.binding.uses)
+    ).toEqual([
+      'builtin:platform.macos', // control's two bindings load first, in declared order
+      'builtin:anthropic.claude.activateCoworkSurface',
+      'builtin:desktop.macos.accessibilitySubmit',
+      'builtin:anthropic.claude.localAgentTrace',
+      'builtin:anthropic.claude.localAgentNormalize',
+    ]);
+  });
+
+  it('returns no built-in defaults for syntactically valid unsupported drivers', () => {
+    expect(
+      getRegisteredExternalHostConfig('openai.chatgpt.chat.browser.web')
+    ).toBeUndefined();
+  });
+
+  it('lists registered external hosts by structured driver slug', () => {
+    expect(listRegisteredExternalHostSlugs()).toEqual([
+      'anthropic.claude.chat.desktop-app.macos',
+      'anthropic.claude.cowork.desktop-app.macos',
+    ]);
+  });
+});
diff --git a/src/evals/externalHost/hostRegistry.ts b/src/evals/externalHost/hostRegistry.ts
new file mode 100644
index 0000000..2f4b790
--- /dev/null
+++ b/src/evals/externalHost/hostRegistry.ts
@@ -0,0 +1,88 @@
+import type { ExternalHostConfig } from './types.js';
+import {
+  CLAUDE_CHAT_DESKTOP_MACOS_DRIVER,
+  CLAUDE_COWORK_DESKTOP_MACOS_DRIVER,
+  driverToSlug,
+} from './driverIdentity.js';
+
+const EXTERNAL_HOST_REGISTRY: Record<
+  string,
+  Partial<ExternalHostConfig> & { name: string; description: string }
+> = {
+  [driverToSlug(CLAUDE_CHAT_DESKTOP_MACOS_DRIVER)]: {
+    driver: CLAUDE_CHAT_DESKTOP_MACOS_DRIVER,
+    name: 'Claude Chat Desktop',
+    description:
+      'Drives the regular Claude Desktop chat surface on macOS and captures low-confidence visible response evidence via Accessibility.',
+    correlation: {
+      strategy: 'prompt_marker',
+      includeInPrompt: true,
+    },
+    capabilities: {
+      control: { uses: 'builtin:platform.macos' },
+      input: {
+        uses: 'builtin:desktop.macos.accessibilitySubmit',
+        with: {
+          appName: 'Claude',
+          createNewConversation: 'unless-disabled',
+        },
+      },
+      completion: {
+        uses: 'builtin:anthropic.claude.accessibilityTrace',
+        provides: ['trace', 'normalize'], // this entry declares no separate trace or normalize binding
+      },
+    },
+  },
+  [driverToSlug(CLAUDE_COWORK_DESKTOP_MACOS_DRIVER)]: {
+    driver: CLAUDE_COWORK_DESKTOP_MACOS_DRIVER,
+    name: 'Claude Cowork Desktop',
+    description:
+      'Drives the Claude Desktop Cowork surface on macOS and captures high-confidence local-agent trace evidence.',
+    correlation: {
+      strategy: 'prompt_marker',
+      includeInPrompt: true,
+    },
+    capabilities: {
+      control: [ // a list: both bindings are declared for the one capability
+        { uses: 'builtin:platform.macos' },
+        {
+          uses: 'builtin:anthropic.claude.activateCoworkSurface',
+          with: { appName: 'Claude' },
+        },
+      ],
+      input: {
+        uses: 'builtin:desktop.macos.accessibilitySubmit',
+        with: { appName: 'Claude', createNewConversation: true },
+      },
+      completion: {
+        uses: 'builtin:anthropic.claude.localAgentTrace',
+        provides: ['trace'], // normalize has its own binding below
+      },
+      normalize: {
+        uses: 'builtin:anthropic.claude.localAgentNormalize',
+      },
+    },
+  },
+};
+
+export function getRegisteredExternalHostConfig(
+  driverSlug: string
+): Partial<ExternalHostConfig> | undefined {
+  return EXTERNAL_HOST_REGISTRY[driverSlug]; // the stored entry itself, not a copy
+}
+
+export function getRegisteredExternalHostDisplayName(
+  driverSlug: string
+): string | undefined {
+  return EXTERNAL_HOST_REGISTRY[driverSlug]?.name; // undefined for unregistered slugs
+}
+
+export function getRegisteredExternalHostDescription(
+  driverSlug: string
+): string | undefined {
+  return EXTERNAL_HOST_REGISTRY[driverSlug]?.description; // undefined for unregistered slugs
+}
+
+export function listRegisteredExternalHostSlugs(): string[] {
+  return Object.keys(EXTERNAL_HOST_REGISTRY); // slugs in registry declaration order
+}
diff --git a/src/evals/externalHost/index.ts b/src/evals/externalHost/index.ts
new file mode 100644
index 0000000..31d3e1a
--- /dev/null
+++ b/src/evals/externalHost/index.ts
@@ -0,0 +1,69 @@
+export { runExternalHostScenario } from './runtime.js';
+export {
+  REQUIRED_HOST_CAPABILITIES,
+  validateHostCapabilities,
+} from './capabilities.js';
+export {
+  getRegisteredExternalHostConfig,
+  getRegisteredExternalHostDescription,
+  getRegisteredExternalHostDisplayName,
+  listRegisteredExternalHostSlugs,
+} from './hostRegistry.js';
+export {
+  listBuiltinExternalHostCapabilities,
+  resolveBuiltinExternalHostCapability,
+} from './builtinCapabilities.js';
+export {
+  listExternalHostCapabilities,
+  loadExternalHostConfig,
+  loadExternalHostRunner,
+  registerExternalHostCapability,
+  resolveExternalHostCapability,
+} from './capabilityRuntime.js';
+export type {
+  LoadedExternalHostCapability,
+  LoadedExternalHostConfig,
+} from './capabilityRuntime.js';
+export {
+  CLAUDE_CHAT_DESKTOP_MACOS_DRIVER,
+  CLAUDE_CODE_CLI_MACOS_DRIVER,
+  CLAUDE_COWORK_DESKTOP_MACOS_DRIVER,
+  driverToSlug,
+  hostTypeFromDriver,
+  normalizeHostDriver,
+  parseDriverSlug,
+} from './driverIdentity.js';
+export {
+  ExternalHostCapabilityBindingSchema,
+  ExternalHostConfigSchema,
+  ExternalHostCorrelationSchema,
+  getExternalHostConfigJsonSchema,
+  getExternalHostReference,
+  HostCapabilitySchema,
+  HostDriverIdSchema,
+  listExternalHostDriverReferences,
+} from './schema.js';
+export type { ExternalHostDriverReference } from './schema.js';
+export type {
+  EvidenceSource,
+  ExternalHostCapabilityBinding,
+  ExternalHostCapabilityContext,
+  ExternalHostCapabilityImplementation,
+  ExternalHostCapabilitiesConfig,
+  ExternalHostConfig,
+  ExternalHostFailureKind,
+  ExternalHostMetadata,
+  ExternalHostRunState,
+  ExternalHostRunResult,
+  ExternalHostRunner,
+  ExternalHostSession,
+  ExternalHostSimulationResult,
+  ExternalHostType,
+  HostArtifact,
+  HostCapability,
+  HostDriverConfig,
+  HostDriverId,
+  HostRunContext,
+  ObservationConfidence,
+  TraceSource,
+} from './types.js';
diff --git a/src/evals/externalHost/runtime.test.ts b/src/evals/externalHost/runtime.test.ts
new file mode 100644
index 0000000..c936463
--- /dev/null
+++ b/src/evals/externalHost/runtime.test.ts
@@ -0,0 +1,56 @@
+import { describe, expect, it } from 'vitest';
+import { formatSubmittedScenario, runExternalHostScenario } from './runtime.js';
+
+describe('external host runtime', () => {
+  it('adds an evaluator marker with an instruction not to mention it', () => {
+    const submitted = formatSubmittedScenario(
+      'Reply with exactly: acknowledged.',
+      'MCP_SERVER_TESTER_run_123'
+    ); // correlation omitted: the default is prompt_marker with includeInPrompt
+
+    expect(submitted).toContain('Reply with exactly: acknowledged.');
+    expect(submitted).toContain('[eval-run-marker:MCP_SERVER_TESTER_run_123]');
+    expect(submitted).toContain('do not mention this marker');
+  });
+
+  it('leaves the submitted scenario unchanged when prompt correlation is disabled', () => {
+    const submitted = formatSubmittedScenario(
+      'Reply with exactly: acknowledged.',
+      'MCP_SERVER_TESTER_run_123',
+      { strategy: 'none' }
+    );
+
+    expect(submitted).toBe('Reply with exactly: acknowledged.');
+  });
+
+  it('supports prompt marker correlation without including it in the prompt', () => {
+    const submitted = formatSubmittedScenario(
+      'Reply with exactly: acknowledged.',
+      'MCP_SERVER_TESTER_run_123',
+      { strategy: 'prompt_marker', includeInPrompt: false }
+    );
+
+    expect(submitted).toBe('Reply with exactly: acknowledged.');
+  });
+
+  it('infers host type for unsupported driver failures', async () => {
+    const result = await runExternalHostScenario(
+      'hello',
+      { driver: 'openai.chatgpt.chat.browser.web' },
+      { runId: 'unsupported-browser' }
+    );
+    // No registry entry and no capabilities: loading fails and is reported as a result.
+    expect(result).toMatchObject({
+      success: false,
+      externalHost: {
+        driverSlug: 'openai.chatgpt.chat.browser.web',
+        hostType: 'browser',
+        failureKind: 'unsupported_host',
+        correlation: {
+          strategy: 'none',
+          includedInPrompt: false,
+        },
+      },
+    });
+  });
+});
diff --git a/src/evals/externalHost/runtime.ts b/src/evals/externalHost/runtime.ts
new file mode 100644
index 0000000..2b9ace7
--- /dev/null
+++ b/src/evals/externalHost/runtime.ts
@@ -0,0 +1,139 @@
+import { randomUUID } from 'node:crypto';
+import type {
+  ExternalHostCorrelationConfig,
+  ExternalHostCorrelationMetadata,
+  ExternalHostConfig,
+  ExternalHostRunResult,
+  HostRunContext,
+} from './types.js';
+import {
+  driverToSlug,
+  hostTypeFromDriver,
+  normalizeHostDriver,
+} from './driverIdentity.js';
+import {
+  createExternalHostRunner,
+  loadExternalHostConfig,
+} from './capabilityRuntime.js';
+
+const DEFAULT_TIMEOUT_MS = 120_000; // milliseconds; applied when the merged config sets no timeoutMs
+const DEFAULT_PROMPT_MARKER_TEMPLATE = // every {{marker}} occurrence is replaced with the run marker
+  'Trace marker for MCP Server Tester; do not mention this marker in your response: [eval-run-marker:{{marker}}]';
+
+export function formatSubmittedScenario(
+  scenario: string,
+  marker: string,
+  correlation: ExternalHostCorrelationConfig = {
+    strategy: 'prompt_marker',
+    includeInPrompt: true,
+  }
+): string {
+  const metadata = normalizeCorrelation(correlation, marker);
+  if (!metadata.includedInPrompt) {
+    return scenario; // correlation disabled, or the marker is kept out of the prompt
+  }
+  // Replacer callback: the marker is inserted verbatim, `$&`-style sequences included.
+  const template = correlation.promptTemplate ?? DEFAULT_PROMPT_MARKER_TEMPLATE;
+  return `${scenario}\n\n${template.replaceAll('{{marker}}', () => marker)}`;
+}
+
+export async function runExternalHostScenario(
+  scenario: string,
+  config: ExternalHostConfig,
+  options: { caseId?: string; runId?: string } = {}
+): Promise<ExternalHostRunResult> {
+  const runId = options.runId ?? `external-host-${randomUUID()}`;
+  const marker = `MCP_SERVER_TESTER_${runId}`;
+  // Configuration problems are reported as an unsupported_host result, not thrown.
+  let loaded;
+  try {
+    loaded = await loadExternalHostConfig(config);
+  } catch (err) {
+    const message = err instanceof Error ? err.message : String(err);
+    return unsupportedHostResult(config, marker, message);
+  }
+  // Timeout and correlation are read from the loaded config, not the raw argument.
+  const timeoutMs = loaded.config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+  const correlation = normalizeCorrelation(loaded.config.correlation, marker);
+  const submittedScenario = formatSubmittedScenario(
+    scenario,
+    marker,
+    loaded.config.correlation
+  );
+
+  const context: HostRunContext = {
+    runId,
+    caseId: options.caseId ?? 'unknown',
+    scenario,
+    submittedScenario,
+    marker,
+    correlation,
+    timeoutMs, // passed along in the context; this function does not enforce it itself
+    startedAtMs: Date.now(),
+  };
+  // NOTE(review): errors thrown while running are not caught here and reject the promise.
+  const runner = createExternalHostRunner(loaded);
+
+  return runner.run(context);
+}
+
+function normalizeCorrelation(
+  correlation: ExternalHostCorrelationConfig | undefined,
+  marker: string
+): ExternalHostCorrelationMetadata {
+  // A missing strategy means 'none'; only prompt_marker can put the marker in the prompt.
+  const strategy = correlation?.strategy ?? 'none';
+  if (strategy !== 'prompt_marker') {
+    return { strategy, marker, includedInPrompt: false };
+  }
+  // prompt_marker includes the marker unless includeInPrompt says otherwise.
+  return {
+    strategy,
+    marker,
+    includedInPrompt: correlation?.includeInPrompt ?? true,
+  };
+}
+
+function unsupportedHostResult(
+  config: ExternalHostConfig,
+  marker: string,
+  error: string
+): ExternalHostRunResult {
+  const driver = (() => {
+    try {
+      return normalizeHostDriver(config.driver);
+    } catch { // unparseable slug: fall back to a placeholder driver
+      return {
+        provider: 'unknown',
+        product: 'unknown',
+        surface: 'unknown',
+        runtime: 'unknown',
+      };
+    }
+  })();
+  const driverSlug = driverToSlug(driver);
+  // Built from the caller's raw config only; capabilitiesUsed is reported as empty.
+  return {
+    success: false as const,
+    toolCalls: [],
+    error,
+    externalHost: {
+      driver,
+      driverSlug,
+      displayName: config.name ?? driverSlug,
+      hostName: config.name ?? driverSlug,
+      hostType: config.hostType ?? hostTypeFromDriver(driver), // explicit config value wins over inference
+      hostVariant: config.variant,
+      capabilitiesUsed: [],
+      traceSource: 'none',
+      traceConfidence: 'unknown',
+      traceLimitations: [
+        'The external host capability configuration could not be loaded.',
+      ],
+      artifacts: [],
+      session: { runMarker: marker },
+      correlation: normalizeCorrelation(config.correlation, marker),
+      failureKind: 'unsupported_host',
+    },
+  };
+}
diff --git a/src/evals/externalHost/schema.test.ts b/src/evals/externalHost/schema.test.ts
new file mode 100644
index 0000000..25431f0
--- /dev/null
+++ b/src/evals/externalHost/schema.test.ts
@@ -0,0 +1,68 @@
+import { describe, expect, it } from 'vitest';
+import {
+  ExternalHostConfigSchema,
+  getExternalHostConfigJsonSchema,
+  getExternalHostReference,
+  listExternalHostDriverReferences,
+} from './schema.js';
+
+describe('external host schema and reference', () => {
+  it('validates minimal built-in external host config', () => {
+    const parsed = ExternalHostConfigSchema.parse({
+      driver: 'anthropic.claude.cowork.desktop-app.macos',
+      timeoutMs: 60_000,
+    });
+
+    expect(parsed).toEqual({ // output equals input: no defaults are injected
+      driver: 'anthropic.claude.cowork.desktop-app.macos',
+      timeoutMs: 60_000,
+    });
+  });
+
+  it('exposes known driver slugs in the JSON schema for editor autocomplete', () => {
+    const schema = getExternalHostConfigJsonSchema();
+    const driver = (schema.properties as Record<string, unknown>)
+      .driver as Record<string, unknown>;
+    const choices = driver.anyOf as Array<Record<string, unknown>>;
+
+    expect(choices[0]).toMatchObject({ // index 0: built-in slug enum is listed first in anyOf
+      type: 'string',
+      enum: [ // NOTE(review): hard-coded; presumably must track the registered slugs
+        'anthropic.claude.chat.desktop-app.macos',
+        'anthropic.claude.cowork.desktop-app.macos',
+      ],
+    });
+  });
+
+  it('lists built-in drivers with examples and internal capability defaults', () => {
+    const references = listExternalHostDriverReferences();
+    const cowork = references.find(
+      (reference) =>
+        reference.slug === 'anthropic.claude.cowork.desktop-app.macos'
+    );
+
+    expect(cowork).toMatchObject({ // partial match: only these fields are pinned
+      name: 'Claude Cowork Desktop',
+      builtIn: true,
+      example: {
+        mode: 'external_host',
+        externalHost: {
+          driver: 'anthropic.claude.cowork.desktop-app.macos',
+        },
+      },
+    });
+    expect(cowork?.capabilities?.input).toMatchObject({
+      uses: 'builtin:desktop.macos.accessibilitySubmit',
+      with: { appName: 'Claude' },
+    });
+  });
+
+  it('bundles schema and driver references for agents and docs generators', () => {
+    const reference = getExternalHostReference();
+
+    expect(reference).toMatchObject({
+      schema: { title: 'MCP Server Tester ExternalHostConfig' },
+      drivers: expect.any(Array), // contents are covered by the test above
+    });
+  });
+});
diff --git a/src/evals/externalHost/schema.ts b/src/evals/externalHost/schema.ts
new file mode 100644
index 0000000..9c13e27
--- /dev/null
+++ b/src/evals/externalHost/schema.ts
@@ -0,0 +1,281 @@
+import { z } from 'zod';
+import {
+  getRegisteredExternalHostConfig,
+  getRegisteredExternalHostDescription,
+  listRegisteredExternalHostSlugs,
+} from './hostRegistry.js';
+import { driverToSlug, normalizeHostDriver } from './driverIdentity.js';
+import type {
+  ExternalHostCapabilitiesConfig,
+  ExternalHostConfig,
+  HostDriverId,
+} from './types.js';
+
+export const HostDriverIdSchema = z.object({ // structured form of a driver identity
+  provider: z.string().min(1), // the four required parts must be non-empty
+  product: z.string().min(1),
+  surface: z.string().min(1),
+  runtime: z.string().min(1),
+  platform: z.string().optional(), // optional; no minimum length enforced
+  channel: z.string().optional(),
+});
+
+export const HostCapabilitySchema = z.enum([ // same members as the HostCapability type
+  'control',
+  'input',
+  'completion',
+  'trace',
+  'normalize',
+]);
+
+export const ExternalHostCapabilityBindingSchema = z.object({
+  uses: z.string().min(1), // implementation id; the only required field
+  with: z.record(z.string(), z.unknown()).optional(), // free-form binding options
+  provides: z.array(HostCapabilitySchema).optional(), // limited to known capabilities
+});
+
+export const ExternalHostCorrelationSchema = z.object({ // every field is optional
+  strategy: z
+    .enum(['prompt_marker', 'host_session_metadata', 'none']) // same literals as ExternalHostCorrelationStrategy
+    .optional(),
+  includeInPrompt: z.boolean().optional(),
+  promptTemplate: z.string().optional(),
+});
+
+export const ExternalHostConfigSchema = z.object({ // `driver` is the only required key
+  driver: z.union([HostDriverIdSchema, z.string().min(1)]), // structured identity or non-empty slug
+  name: z.string().optional(),
+  hostType: z.enum(['cli', 'browser', 'desktop', 'custom']).optional(), // same literals as ExternalHostType
+  variant: z.string().optional(),
+  timeoutMs: z.number().int().positive().optional(), // whole milliseconds, > 0
+  capabilities: z
+    .partialRecord( // keys limited to HostCapabilitySchema values; none required
+      HostCapabilitySchema,
+      z.union([ // one binding or a list of bindings per capability
+        ExternalHostCapabilityBindingSchema,
+        z.array(ExternalHostCapabilityBindingSchema),
+      ])
+    )
+    .optional(),
+  correlation: ExternalHostCorrelationSchema.optional(),
+  options: z.record(z.string(), z.unknown()).optional(), // free-form driver options
+});
+
+export interface ExternalHostDriverReference { // one entry per registered driver slug
+  slug: string;
+  driver: HostDriverId; // structured identity normalized from the slug
+  name: string; // registered config name, or the slug when unnamed
+  description?: string;
+  builtIn: true; // literal type: this shape only describes built-in drivers
+  defaultConfig: ExternalHostConfig;
+  capabilities?: ExternalHostCapabilitiesConfig; // copied from the registered config
+  example: { // minimal sample eval case that uses this driver
+    mode: 'external_host';
+    scenario: string;
+    externalHost: Pick<ExternalHostConfig, 'driver' | 'timeoutMs'>;
+    expect: { containsText: string };
+  };
+}
+
+export function listExternalHostDriverReferences(): ExternalHostDriverReference[] {
+  return listRegisteredExternalHostSlugs().map((slug) => { // one reference per registered slug
+    const config = getRegisteredExternalHostConfig(slug); // may be undefined (see `?.` below)
+    const driver = normalizeHostDriver(slug);
+    const name = config?.name ?? slug; // unnamed drivers are labelled by slug
+
+    return {
+      slug,
+      driver,
+      name,
+      description: getRegisteredExternalHostDescription(slug),
+      builtIn: true,
+      defaultConfig: {
+        driver, // fallback only: the spread below overwrites it with config.driver
+        ...(config ?? {}),
+      },
+      capabilities: config?.capabilities,
+      example: {
+        mode: 'external_host',
+        scenario: 'Ask the host to complete the task you want to evaluate.',
+        externalHost: {
+          driver: slug, // examples use the slug form, not the structured identity
+          timeoutMs: config?.timeoutMs ?? 60_000, // 60s when the driver sets no timeout
+        },
+        expect: {
+          containsText: 'expected text', // placeholder assertion value
+        },
+      },
+    };
+  });
+}
+
+export function getExternalHostConfigJsonSchema(): Record<string, unknown> {
+  const driverSlugs = listRegisteredExternalHostSlugs(); // read on every call
+
+  return {
+    $schema: 'https://json-schema.org/draft/2020-12/schema',
+    $id: 'https://github.com/gleanwork/mcp-server-tester/schemas/external-host-config.schema.json',
+    title: 'MCP Server Tester ExternalHostConfig',
+    description:
+      'Configuration for running an MCP eval through an external host driver.',
+    type: 'object',
+    additionalProperties: false, // NOTE(review): ExternalHostConfigSchema is a plain z.object; confirm both agree on unknown keys
+    required: ['driver'],
+    properties: {
+      driver: {
+        description:
+          'Canonical built-in driver slug, custom driver slug, or structured driver identity.',
+        anyOf: [ // order matters to schema.test.ts, which inspects index 0
+          {
+            type: 'string',
+            enum: driverSlugs, // known slugs, surfaced for editor autocomplete
+            description:
+              'Known built-in driver slug. Prefer this form for normal eval datasets.',
+          },
+          {
+            type: 'string',
+            minLength: 1, // any non-empty string, so custom slugs also validate
+            description:
+              'Custom driver slug. Use when registering project-local capabilities.',
+          },
+          hostDriverIdJsonSchema(), // structured { provider, product, ... } form
+        ],
+      },
+      name: {
+        type: 'string',
+        description: 'Optional display name shown in reports.',
+      },
+      hostType: {
+        type: 'string',
+        enum: ['cli', 'browser', 'desktop', 'custom'], // same literals as ExternalHostType
+        description: 'Host category shown in reports.',
+      },
+      variant: {
+        type: 'string',
+        description: 'Optional variant label for matrix-style runs.',
+      },
+      timeoutMs: {
+        type: 'integer',
+        minimum: 1, // matches z.number().int().positive()
+        description: 'End-to-end timeout for the host run in milliseconds.',
+      },
+      correlation: externalHostCorrelationJsonSchema(),
+      options: {
+        type: 'object',
+        additionalProperties: true,
+        description:
+          'Driver-wide options interpreted by the selected driver or capability bindings.',
+      },
+      capabilities: externalHostCapabilitiesJsonSchema(),
+    },
+    examples: listExternalHostDriverReferences().map((reference) => ({ // one example per registered driver
+      driver: reference.slug,
+      timeoutMs: reference.example.externalHost.timeoutMs,
+    })),
+  };
+}
+
+export function getExternalHostReference(): Record<string, unknown> {
+  return { // bundle rebuilt on every call from the two generators below
+    schema: getExternalHostConfigJsonSchema(),
+    drivers: listExternalHostDriverReferences(),
+  };
+}
+
+function hostDriverIdJsonSchema(): Record<string, unknown> {
+  return { // JSON Schema counterpart of HostDriverIdSchema
+    type: 'object',
+    additionalProperties: false,
+    required: ['provider', 'product', 'surface', 'runtime'],
+    properties: {
+      provider: { type: 'string', minLength: 1 },
+      product: { type: 'string', minLength: 1 },
+      surface: { type: 'string', minLength: 1 },
+      runtime: { type: 'string', minLength: 1 },
+      platform: { type: 'string' }, // optional, no minimum length
+      channel: { type: 'string' },
+    },
+  };
+}
+
+function externalHostCorrelationJsonSchema(): Record<string, unknown> {
+  return { // no `required` list: every field is optional
+    type: 'object',
+    additionalProperties: false,
+    description:
+      'How a submitted host run is correlated with host-native trace evidence.',
+    properties: {
+      strategy: {
+        type: 'string',
+        enum: ['prompt_marker', 'host_session_metadata', 'none'], // same literals as ExternalHostCorrelationSchema
+      },
+      includeInPrompt: {
+        type: 'boolean',
+        description:
+          'Whether to include the generated run marker in the host-visible prompt.',
+      },
+      promptTemplate: {
+        type: 'string',
+        description: 'Prompt suffix template. Supports {{marker}}.',
+      },
+    },
+  };
+}
+
+function externalHostCapabilitiesJsonSchema(): Record<string, unknown> {
+  return {
+    type: 'object',
+    additionalProperties: false, // only the capability names generated below are allowed
+    description:
+      'Advanced escape hatch for overriding the capability recipe. Most users should choose a built-in driver instead.',
+    properties: Object.fromEntries(
+      HostCapabilitySchema.options.map((capability) => [ // one property per enum member
+        capability,
+        {
+          anyOf: [ // a single binding or an array of bindings
+            externalHostCapabilityBindingJsonSchema(),
+            {
+              type: 'array',
+              items: externalHostCapabilityBindingJsonSchema(),
+            },
+          ],
+        },
+      ])
+    ),
+  };
+}
+
+function externalHostCapabilityBindingJsonSchema(): Record<string, unknown> {
+  return { // JSON Schema counterpart of ExternalHostCapabilityBindingSchema
+    type: 'object',
+    additionalProperties: false,
+    required: ['uses'],
+    properties: {
+      uses: {
+        type: 'string',
+        minLength: 1,
+        description:
+          'Capability implementation id. Built-ins use builtin:<id>; custom integrations may use module:<specifier>#<export>.',
+      },
+      with: {
+        type: 'object',
+        additionalProperties: true, // free-form: interpreted by the implementation
+        description:
+          'Binding-local options interpreted by the selected capability implementation.',
+      },
+      provides: {
+        type: 'array',
+        items: {
+          type: 'string',
+          enum: HostCapabilitySchema.options, // derived from the zod enum
+        },
+      },
+    },
+  };
+}
+
+export function externalHostDriverSlugForConfig(
+  config: ExternalHostConfig
+): string {
+  return driverToSlug(normalizeHostDriver(config.driver)); // normalize first, then derive the slug
+}
diff --git a/src/evals/externalHost/types.ts b/src/evals/externalHost/types.ts
new file mode 100644
index 0000000..650cc38
--- /dev/null
+++ b/src/evals/externalHost/types.ts
@@ -0,0 +1,274 @@
+import type {
+  LLMToolCall,
+  MCPHostSimulationResult,
+} from '../mcpHost/mcpHostTypes.js';
+import type { UsageMetrics } from '../../types/index.js';
+
+export type ExternalHostType = 'cli' | 'browser' | 'desktop' | 'custom';
+
+export type HostCapability =
+  | 'control'
+  | 'input'
+  | 'completion'
+  | 'trace'
+  | 'normalize';
+
+export type TraceSource =
+  | 'mcp-proxy'
+  | 'mcp-server-logs'
+  | 'host-local-transcript'
+  | 'host-native-export'
+  | 'browser-api'
+  | 'accessibility'
+  | 'dom'
+  | 'screenshot'
+  | 'stdout'
+  | 'manual-import'
+  | 'none';
+
+export type ObservationConfidence = 'high' | 'medium' | 'low' | 'unknown';
+
+export type ExternalHostCorrelationStrategy =
+  | 'prompt_marker'
+  | 'host_session_metadata'
+  | 'none';
+
+export interface HostDriverId { // structured driver identity (alternative to a slug string)
+  provider: string; // e.g. 'anthropic'
+  product: string; // e.g. 'claude'
+  surface: string; // e.g. 'cowork'
+  runtime: string; // e.g. 'desktop-app'
+  platform?: string; // e.g. 'macos'
+  channel?: string;
+}
+
+export type HostDriverConfig = HostDriverId | string;
+
+export type ExternalHostFailureKind =
+  | 'app_unavailable'
+  | 'automation_permission_denied'
+  | 'submission_failed'
+  | 'no_matching_session'
+  | 'ambiguous_matching_sessions'
+  | 'timeout'
+  | 'parse_failure'
+  | 'host_run_failed'
+  | 'unsupported_host'
+  | 'unknown';
+
+export interface HostArtifact {
+  kind:
+    | 'stdout'
+    | 'stderr'
+    | 'log'
+    | 'transcript'
+    | 'audit'
+    | 'metadata'
+    | 'screenshot'
+    | 'video'
+    | 'har'
+    | 'trace';
+  name: string;
+  path?: string;
+  contentType?: string;
+  summary?: string;
+}
+
+export interface ExternalHostSession {
+  id?: string;
+  runMarker: string; // the only required field
+  requestId?: string;
+  cliSessionId?: string;
+  cwd?: string;
+  startedAt?: string; // NOTE(review): timestamp format is not fixed by this type
+  completedAt?: string;
+}
+
+export interface ExternalHostCorrelationConfig {
+  /**
+   * How this run should be correlated with host-native evidence.
+   *
+   * - prompt_marker: append a marker to the submitted prompt.
+   * - host_session_metadata: rely on host-native session metadata.
+   * - none: no host-visible marker is submitted.
+   */
+  strategy?: ExternalHostCorrelationStrategy;
+  /**
+   * Whether the marker should be included in the host-visible prompt.
+   * Defaults to true only for prompt_marker.
+   */
+  includeInPrompt?: boolean;
+  /**
+   * Optional prompt suffix template. Supports {{marker}}.
+   */
+  promptTemplate?: string;
+}
+
+export interface ExternalHostCorrelationMetadata { // resolved form of ExternalHostCorrelationConfig
+  strategy: ExternalHostCorrelationStrategy; // always set, unlike the config field
+  marker: string;
+  includedInPrompt: boolean;
+}
+
+export interface ExternalHostMetadata { // host details attached to every external-host result
+  driver: HostDriverId;
+  driverSlug: string; // slug form of `driver`
+  displayName: string;
+  hostName: string; // shown as the host badge in the report's detail modal
+  hostType: ExternalHostType;
+  hostVariant?: string;
+  capabilitiesUsed: HostCapability[];
+  traceSource: TraceSource;
+  traceConfidence: ObservationConfidence; // run-level fallback when a field has no own confidence
+  traceLimitations?: string[];
+  artifacts: HostArtifact[];
+  session: ExternalHostSession;
+  correlation: ExternalHostCorrelationMetadata;
+  failureKind?: ExternalHostFailureKind; // when set on a failed case, the report UI labels it a host/automation failure
+  sources?: { // per-field source only; the report UI consults it after `evidence`
+    finalAnswer?: TraceSource;
+    toolCalls?: TraceSource;
+    usage?: TraceSource;
+    cost?: TraceSource;
+  };
+  evidence?: { // per-field source plus confidence; preferred by the report UI
+    finalAnswer?: EvidenceSource;
+    toolCalls?: EvidenceSource;
+    usage?: EvidenceSource;
+    cost?: EvidenceSource;
+  };
+}
+
+export interface ExternalHostConfig {
+  /**
+   * Canonical structured driver identity or derived slug.
+   * Example: `anthropic.claude.cowork.desktop-app.macos`.
+   */
+  driver: HostDriverConfig;
+  /**
+   * Human-readable host name shown in reports.
+   */
+  name?: string;
+  /**
+   * Host type shown in reports.
+   */
+  hostType?: ExternalHostType;
+  /**
+   * Optional variant label for matrix-style runs.
+   */
+  variant?: string;
+  /**
+   * End-to-end timeout for the host run.
+   */
+  timeoutMs?: number;
+  /**
+   * Capability bindings used to compose this external host runner.
+   * If omitted, the runtime may provide a built-in default for known drivers.
+   */
+  capabilities?: ExternalHostCapabilitiesConfig;
+  /**
+   * Run correlation strategy. Built-in drivers may provide defaults.
+   */
+  correlation?: ExternalHostCorrelationConfig;
+  /**
+   * Driver-wide options available to capability implementations.
+   */
+  options?: Record<string, unknown>;
+}
+
+export interface HostRunContext { // per-run input passed to ExternalHostRunner.run
+  runId: string;
+  caseId: string;
+  scenario: string;
+  submittedScenario: string; // NOTE(review): presumably the scenario text as actually sent to the host — confirm
+  marker: string;
+  correlation: ExternalHostCorrelationMetadata;
+  timeoutMs: number; // required here, optional on ExternalHostConfig
+  startedAtMs: number; // epoch milliseconds
+}
+
+export interface ExternalHostSimulationResult extends MCPHostSimulationResult {
+  externalHost: ExternalHostMetadata;
+}
+
+export interface ExternalHostRunSuccess { // success arm of ExternalHostRunResult
+  success: true; // discriminant
+  response?: string; // final answer text, when one was captured
+  toolCalls: LLMToolCall[];
+  conversationHistory?: MCPHostSimulationResult['conversationHistory'];
+  usage?: UsageMetrics;
+  llmDurationMs?: number;
+  mcpDurationMs?: number;
+  externalHost: ExternalHostMetadata;
+}
+
+export interface ExternalHostRunFailure { // failure arm of ExternalHostRunResult
+  success: false; // discriminant
+  error: string; // required on failures
+  toolCalls: LLMToolCall[]; // still required on the failure arm
+  externalHost: ExternalHostMetadata;
+}
+
+export type ExternalHostRunResult = // discriminated union: narrow on `success`
+  | ExternalHostRunSuccess
+  | ExternalHostRunFailure;
+
+export type ExternalHostCapabilitiesConfig = Partial<
+  Record<
+    HostCapability,
+    ExternalHostCapabilityBinding | ExternalHostCapabilityBinding[]
+  >
+>;
+
+export interface ExternalHostCapabilityBinding {
+  /**
+   * Implementation identifier. Built-ins use `builtin:<id>`; callers may use
+   * `module:<specifier>#<export>` to load project-local integrations.
+   */
+  uses: string;
+  /**
+   * Binding-local options interpreted only by the selected implementation.
+   */
+  with?: Record<string, unknown>;
+  /**
+   * Extra capabilities this binding should satisfy beyond its map key.
+   */
+  provides?: HostCapability[];
+}
+
+export interface ExternalHostRunState {
+  driver: HostDriverId;
+  driverSlug: string;
+  displayName: string;
+  capabilitiesUsed: HostCapability[];
+  data: Record<string, unknown>;
+  result?: ExternalHostRunResult;
+}
+
+export interface ExternalHostCapabilityContext {
+  config: ExternalHostConfig;
+  run: HostRunContext;
+  capability: HostCapability;
+  binding: ExternalHostCapabilityBinding;
+  state: ExternalHostRunState;
+}
+
+export interface ExternalHostCapabilityImplementation {
+  id: string; // NOTE(review): presumably the id referenced by a binding's `uses` — confirm
+  capabilities: HostCapability[];
+  setup?( // optional hook; may resolve to a run result or to nothing
+    context: ExternalHostCapabilityContext
+  ): Promise<ExternalHostRunResult | void>;
+  run?( // optional hook; same signature as setup
+    context: ExternalHostCapabilityContext
+  ): Promise<ExternalHostRunResult | void>;
+}
+
+export interface ExternalHostRunner {
+  run(context: HostRunContext): Promise<ExternalHostRunResult>;
+}
+
+export interface EvidenceSource { // value type of ExternalHostMetadata.evidence entries
+  source: TraceSource;
+  confidence: ObservationConfidence;
+}
diff --git a/src/index.ts b/src/index.ts
index b5b91d6..ddf422b 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -243,6 +243,40 @@ export {
   getMissingDependencyMessage,
 } from './evals/mcpHost/index.js';
 
+// External Host Evals (experimental)
+export type {
+  EvidenceSource,
+  ExternalHostCapabilityBinding,
+  ExternalHostCapabilityContext,
+  ExternalHostCapabilityImplementation,
+  ExternalHostCapabilitiesConfig,
+  ExternalHostConfig,
+  ExternalHostDriverReference,
+  ExternalHostFailureKind,
+  ExternalHostMetadata,
+  ExternalHostRunResult,
+  ExternalHostSession,
+  ExternalHostSimulationResult,
+  ExternalHostType,
+  HostArtifact,
+  HostCapability,
+  HostDriverConfig,
+  HostDriverId,
+  HostRunContext,
+  ObservationConfidence,
+  TraceSource,
+} from './evals/externalHost/index.js';
+export {
+  driverToSlug,
+  normalizeHostDriver,
+  parseDriverSlug,
+  getExternalHostConfigJsonSchema,
+  getExternalHostReference,
+  listExternalHostDriverReferences,
+  registerExternalHostCapability,
+  runExternalHostScenario,
+} from './evals/externalHost/index.js';
+
 // Judge
 export { createJudge } from './judge/judgeClient.js';
 export {
diff --git a/src/mcp/response.ts b/src/mcp/response.ts
index 1189250..5e5130b 100644
--- a/src/mcp/response.ts
+++ b/src/mcp/response.ts
@@ -190,6 +190,11 @@ export function extractText(response: unknown): string {
       return r.text;
     }
 
+    // Host simulation results expose the final answer as `response`.
+    if (typeof r.response === 'string') {
+      return r.response;
+    }
+
     // Fallback to JSON
     return JSON.stringify(r);
   }
diff --git a/src/reporters/mcpReporter.test.ts b/src/reporters/mcpReporter.test.ts
index 2cc490b..1bbeb75 100644
--- a/src/reporters/mcpReporter.test.ts
+++ b/src/reporters/mcpReporter.test.ts
@@ -382,6 +382,77 @@ describe('MCPReporter.buildRunData()', () => {
     });
   });
 
+  describe('external host metadata', () => {
+    it('preserves external host trace metadata in run data', () => {
+      setResults(reporter, [
+        makeResult({
+          pass: true,
+          toolName: 'external_host',
+          externalHost: {
+            driver: {
+              provider: 'anthropic',
+              product: 'claude',
+              surface: 'cowork',
+              runtime: 'desktop-app',
+              platform: 'macos',
+            },
+            driverSlug: 'anthropic.claude.cowork.desktop-app.macos',
+            displayName: 'Claude Cowork Desktop',
+            hostName: 'Claude Cowork Desktop',
+            hostType: 'desktop',
+            capabilitiesUsed: [
+              'control',
+              'input',
+              'completion',
+              'trace',
+              'normalize',
+            ],
+            traceSource: 'host-local-transcript',
+            traceConfidence: 'high',
+            traceLimitations: ['fixture limitation'],
+            artifacts: [
+              {
+                kind: 'audit',
+                name: 'Claude audit log',
+                path: '/tmp/audit.jsonl',
+              },
+            ],
+            session: {
+              id: 'local_123',
+              runMarker: 'MCP_SERVER_TESTER_TEST',
+              requestId: 'req_123',
+            },
+            correlation: {
+              strategy: 'prompt_marker',
+              marker: 'MCP_SERVER_TESTER_TEST',
+              includedInPrompt: true,
+            },
+            evidence: {
+              finalAnswer: {
+                source: 'host-local-transcript',
+                confidence: 'high',
+              },
+              toolCalls: {
+                source: 'host-local-transcript',
+                confidence: 'high',
+              },
+            },
+          },
+        }),
+      ]);
+
+      const data = callBuildRunData(reporter, 100);
+
+      expect(data.results[0]?.externalHost).toMatchObject({
+        driverSlug: 'anthropic.claude.cowork.desktop-app.macos',
+        hostName: 'Claude Cowork Desktop',
+        traceSource: 'host-local-transcript',
+        traceConfidence: 'high',
+        session: { id: 'local_123', requestId: 'req_123' },
+      });
+    });
+  });
+
   describe('conformanceChecks and serverCapabilities', () => {
     it('returns undefined conformanceChecks when none are recorded', () => {
       setResults(reporter, [makeResult({ pass: true })]);
diff --git a/src/reporters/ui-src/components/Results/DetailModal.tsx b/src/reporters/ui-src/components/Results/DetailModal.tsx
index d8dbcdf..bbe02ac 100644
--- a/src/reporters/ui-src/components/Results/DetailModal.tsx
+++ b/src/reporters/ui-src/components/Results/DetailModal.tsx
@@ -19,6 +19,213 @@ function formatResponsePreview(response: unknown): string {
   return JSON.stringify(response, null, 2) ?? '';
 }
 
+function getExternalHostEvidenceRows(
+  externalHost: NonNullable<EvalCaseResult['externalHost']>
+) {
+  const labels = { // display label per evidence field; key order sets row order
+    finalAnswer: 'Final answer',
+    toolCalls: 'Tool calls',
+    usage: 'Usage',
+    cost: 'Cost',
+  } as const;
+  const keys = Object.keys(labels) as Array<keyof typeof labels>;
+
+  return keys
+    .map((key) => {
+      const evidence = externalHost.evidence?.[key];
+      const source = evidence?.source ?? externalHost.sources?.[key]; // `evidence` wins over `sources`
+      const confidence = evidence?.confidence;
+
+      if (!source && !confidence) { // nothing reported for this field: no row
+        return undefined;
+      }
+
+      return {
+        key,
+        label: labels[key],
+        source: source ?? 'unknown', // confidence without a source still gets a row
+        confidence: confidence ?? externalHost.traceConfidence, // run-level fallback
+      };
+    })
+    .filter((row): row is NonNullable<typeof row> => row !== undefined);
+}
+
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === 'object' && value !== null && !Array.isArray(value); // rejects null and arrays
+}
+
+function responseRecord(result: EvalCaseResult): Record<string, unknown> {
+  return isRecord(result.response) ? result.response : {}; // non-object responses become {}
+}
+
+function resultToolCalls(
+  result: EvalCaseResult
+): Array<{ id?: string; name: string; arguments: Record<string, unknown> }> {
+  const toolCalls = responseRecord(result).toolCalls; // read from the response payload
+  if (!Array.isArray(toolCalls)) { // missing or malformed: no tool calls to show
+    return [];
+  }
+
+  return toolCalls.filter( // keep only well-formed entries; `id` is not validated
+    (
+      call
+    ): call is {
+      id?: string;
+      name: string;
+      arguments: Record<string, unknown>;
+    } =>
+      isRecord(call) &&
+      typeof call.name === 'string' &&
+      isRecord(call.arguments) // entries without object arguments are dropped
+  );
+}
+
+function finalAnswer(result: EvalCaseResult): string | undefined {
+  const response = responseRecord(result).response; // nested `response` field of the payload
+  return typeof response === 'string' ? response : undefined; // only string answers count
+}
+
+function usageForResult(
+  result: EvalCaseResult
+): Record<string, unknown> | undefined {
+  const responseUsage = responseRecord(result).usage;
+  return (
+    (result.hostUsage as unknown as Record<string, unknown> | undefined) ?? // hostUsage takes precedence
+    (isRecord(responseUsage) ? responseUsage : undefined) // else usage embedded in the response
+  );
+}
+
+function numberField(
+  value: Record<string, unknown> | undefined,
+  key: string
+): number | undefined {
+  const nested = value?.[key]; // tolerates an undefined record
+  return typeof nested === 'number' ? nested : undefined; // non-numbers read as undefined
+}
+
+function formatNumber(value: number | undefined): string {
+  return value === undefined ? 'unknown' : value.toLocaleString(); // locale-dependent grouping
+}
+
+function formatCost(value: number | undefined): string {
+  if (value === undefined) { // no cost reported
+    return 'unknown';
+  }
+  return `$${value.toFixed(value === 0 ? 2 : 4)}`; // $0.00 for zero, four decimals otherwise
+}
+
+function formatMs(value: number | undefined): string {
+  if (value === undefined) { // no duration reported
+    return 'unknown';
+  }
+  return value >= 1000
+    ? `${(value / 1000).toFixed(1)}s` // one second or more: seconds, one decimal
+    : `${value.toFixed(0)}ms`; // below one second: whole milliseconds
+}
+
+function jsonPreview(value: unknown): string {
+  return JSON.stringify(value, null, 2) ?? ''; // stringify can yield undefined (e.g. for undefined input)
+}
+
+function InfoField({
+  label,
+  value,
+}: {
+  label: string; // rendered in the <h4> heading
+  value: React.ReactNode; // rendered in the body <div>
+}) {
+  return (
+    <div>
+      <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-1">
+        {label}
+      </h4>
+      <div className="text-sm break-words">{value}</div>
+    </div>
+  );
+}
+
+function JsonBlock({ value }: { value: unknown }) { // <pre> block of jsonPreview(value)
+  return (
+    <pre className="text-xs font-mono bg-muted p-3 rounded-md overflow-x-auto whitespace-pre-wrap">
+      {jsonPreview(value)}
+    </pre>
+  );
+}
+
+function expectationEntries(result: EvalCaseResult) {
+  return Object.entries(result.expectations ?? {}).filter( // missing expectations => []
+    (entry): entry is [string, NonNullable<(typeof entry)[1]>] =>
+      entry[1] !== undefined // drop keys whose value is undefined
+  );
+}
+
+function failedExpectationEntries(result: EvalCaseResult) {
+  return expectationEntries(result).filter(([, expectation]) => { // subset of the defined entries
+    return !expectation.pass; // keep only those that did not pass
+  });
+}
+
+function getVerdictSummary(result: EvalCaseResult): {
+  category: string;
+  reason: string;
+} {
+  const failedAssertions = failedExpectationEntries(result).map(
+    ([type]) => type // keep only the expectation names
+  );
+
+  if (result.pass) { // checked first: a passing case ignores everything below
+    return {
+      category: 'Pass',
+      reason: 'All configured assertions passed.',
+    };
+  }
+
+  if (result.externalHost?.failureKind) { // host failure outranks errors and assertions
+    return {
+      category: 'Host or automation failure',
+      reason: `The driver failed before producing trustworthy eval evidence: ${result.externalHost.failureKind}.`,
+    };
+  }
+
+  if (result.error) { // an empty error string is falsy and falls through
+    const firstLine = stripAnsiCodes(result.error).split('\n')[0]; // first line only
+    return {
+      category: 'Execution failure',
+      reason: firstLine,
+    };
+  }
+
+  if (failedAssertions.length > 0) { // reached only without failureKind or error
+    return {
+      category: 'Assertion failure',
+      reason: `${failedAssertions.length} configured assertion${failedAssertions.length === 1 ? '' : 's'} failed: ${failedAssertions.join(', ')}.`,
+    };
+  }
+
+  return { // failed, but nothing in the report says why
+    category: 'Failure',
+    reason:
+      'The run failed without a specific assertion or host-driver error in the report.',
+  };
+}
+
+function evidenceSummary(
+  externalHost: NonNullable<EvalCaseResult['externalHost']> | undefined,
+  key: 'finalAnswer' | 'toolCalls' | 'usage' | 'cost'
+): string {
+  if (!externalHost) { // not an external-host result
+    return 'not reported';
+  }
+  const evidence = externalHost.evidence?.[key];
+  const source = evidence?.source ?? externalHost.sources?.[key]; // `evidence` wins over `sources`
+  const confidence = evidence?.confidence ?? externalHost.traceConfidence; // run-level fallback
+
+  if (!source) { // no source => 'not reported', even when a confidence exists
+    return 'not reported';
+  }
+  return `${source} · ${confidence}`;
+}
+
 interface DetailModalProps {
   result: EvalCaseResult | null;
   onClose: () => void;
@@ -81,12 +288,24 @@ export function DetailModal({ result, onClose }: DetailModalProps) {
 
   const responseText = formatResponsePreview(result.response);
   const isLargeResponse = responseText.length > 500;
-  const hasAssertions = Object.keys(result.expectations ?? {}).length > 0;
+  const expectationRows = expectationEntries(result);
+  const failedExpectationRows = failedExpectationEntries(result);
+  const hasAssertions = expectationRows.length > 0;
   const hasIterations =
     result.iterationResults && result.iterationResults.length > 0;
   const iterations = result.iterationResults!;
   const displayRate = result.assertionPassRate;
   const infraErrorRate = result.infrastructureErrorRate;
+  const externalHostEvidenceRows = result.externalHost
+    ? getExternalHostEvidenceRows(result.externalHost)
+    : [];
+  const hostToolCalls = resultToolCalls(result);
+  const hostUsage = usageForResult(result);
+  const answer = finalAnswer(result);
+  const llmDurationMs = numberField(responseRecord(result), 'llmDurationMs');
+  const mcpDurationMs = numberField(responseRecord(result), 'mcpDurationMs');
+  const externalHostConfig = result.request?.externalHost;
+  const verdict = getVerdictSummary(result);
 
   return (
     <>
@@ -229,6 +448,24 @@ export function DetailModal({ result, onClose }: DetailModalProps) {
                   {result.project}
                 </span>
               )}
+              {result.externalHost && (
+                <>
+                  <span className="px-2 py-1 rounded text-xs font-medium bg-teal-100 text-teal-700 dark:bg-teal-900/30 dark:text-teal-300">
+                    {result.externalHost.hostName}
+                  </span>
+                  <span
+                    className={`px-2 py-1 rounded text-xs font-medium ${
+                      result.externalHost.traceConfidence === 'high'
+                        ? 'bg-green-100 text-green-700 dark:bg-green-900/30 dark:text-green-300'
+                        : result.externalHost.traceConfidence === 'medium'
+                          ? 'bg-amber-100 text-amber-700 dark:bg-amber-900/30 dark:text-amber-300'
+                          : 'bg-gray-100 text-gray-700 dark:bg-gray-800 dark:text-gray-300'
+                    }`}
+                  >
+                    {result.externalHost.traceConfidence} trace
+                  </span>
+                </>
+              )}
               <span className="px-2 py-1 rounded text-xs font-medium bg-gray-100 text-gray-600 dark:bg-gray-800 dark:text-gray-400">
                 {result.durationMs.toFixed(0)}ms
               </span>
@@ -244,18 +481,155 @@ export function DetailModal({ result, onClose }: DetailModalProps) {
               )}
             </div>
 
-            {/* Request — show what was sent */}
+            <CollapsibleSection title="Verdict" defaultOpen={true}>
+              <div className="space-y-4">
+                <div
+                  className={`rounded-md border p-4 ${
+                    result.pass
+                      ? 'border-green-500/30 bg-green-500/10'
+                      : result.externalHost?.failureKind || result.error
+                        ? 'border-orange-500/30 bg-orange-500/10'
+                        : 'border-red-500/30 bg-red-500/10'
+                  }`}
+                >
+                  <div className="flex flex-wrap items-center gap-2 mb-2">
+                    <span className="text-sm font-semibold">
+                      {verdict.category}
+                    </span>
+                    {failedExpectationRows.length > 0 && (
+                      <span className="text-xs text-muted-foreground">
+                        {failedExpectationRows.length} failed assertion
+                        {failedExpectationRows.length === 1 ? '' : 's'}
+                      </span>
+                    )}
+                  </div>
+                  <p className="text-sm text-muted-foreground">
+                    {verdict.reason}
+                  </p>
+                </div>
+
+                {result.externalHost && (
+                  <div className="grid grid-cols-1 sm:grid-cols-3 gap-3 text-sm">
+                    <InfoField
+                      label="Driver"
+                      value={
+                        <>
+                          <span className="font-medium">
+                            {result.externalHost.displayName}
+                          </span>
+                          <p className="font-mono text-xs text-muted-foreground break-all mt-1">
+                            {result.externalHost.driverSlug}
+                          </p>
+                        </>
+                      }
+                    />
+                    <InfoField
+                      label="Trace"
+                      value={`${result.externalHost.traceSource} · ${result.externalHost.traceConfidence}`}
+                    />
+                    <InfoField
+                      label="Correlation"
+                      value={
+                        result.externalHost.correlation.includedInPrompt
+                          ? `${result.externalHost.correlation.strategy} in prompt`
+                          : result.externalHost.correlation.strategy
+                      }
+                    />
+                    <InfoField
+                      label="Final Answer Source"
+                      value={evidenceSummary(
+                        result.externalHost,
+                        'finalAnswer'
+                      )}
+                    />
+                    <InfoField
+                      label="Tool Evidence"
+                      value={evidenceSummary(result.externalHost, 'toolCalls')}
+                    />
+                    <InfoField
+                      label="Usage Evidence"
+                      value={evidenceSummary(result.externalHost, 'usage')}
+                    />
+                  </div>
+                )}
+              </div>
+            </CollapsibleSection>
+
+            {/* Setup and configuration — show what the eval was configured to run */}
             {result.request &&
               (result.request.args ||
                 result.request.scenario ||
-                result.request.description) && (
-                <CollapsibleSection title="Request" defaultOpen={true}>
-                  <div className="space-y-3">
+                result.request.externalHost ||
+                result.request.description ||
+                result.request.expect) && (
+                <CollapsibleSection
+                  title="Setup & Configuration"
+                  defaultOpen={false}
+                >
+                  <div className="space-y-4">
                     {result.request.description && (
                       <p className="text-sm text-muted-foreground">
                         {result.request.description}
                       </p>
                     )}
+
+                    <div className="grid grid-cols-1 sm:grid-cols-3 gap-3">
+                      <InfoField
+                        label="Mode"
+                        value={
+                          <code className="text-xs bg-muted px-2 py-1 rounded">
+                            {result.request.mode ?? result.toolName}
+                          </code>
+                        }
+                      />
+                      <InfoField
+                        label="Dataset"
+                        value={
+                          <span className="font-medium">
+                            {result.datasetName}
+                          </span>
+                        }
+                      />
+                      <InfoField
+                        label="Iterations"
+                        value={
+                          result.request.iterations ??
+                          result.iterationResults?.length ??
+                          1
+                        }
+                      />
+                      {result.request.accuracyThreshold !== undefined && (
+                        <InfoField
+                          label="Accuracy Threshold"
+                          value={`${(result.request.accuracyThreshold * 100).toFixed(0)}%`}
+                        />
+                      )}
+                      {result.request.judgeReps !== undefined && (
+                        <InfoField
+                          label="Judge Reps"
+                          value={result.request.judgeReps}
+                        />
+                      )}
+                      {result.request.tags &&
+                        result.request.tags.length > 0 && (
+                          <InfoField
+                            label="Tags"
+                            value={
+                              <div className="flex flex-wrap gap-1">
+                                {result.request.tags.map((tag) => (
+                                  <span
+                                    key={tag}
+                                    className="px-2 py-1 rounded text-xs bg-muted text-muted-foreground"
+                                  >
+                                    {tag}
+                                  </span>
+                                ))}
+                              </div>
+                            }
+                          />
+                        )}
+                    </div>
+
                     {result.request.scenario && (
                       <div>
                         <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-1">
@@ -266,6 +640,7 @@ export function DetailModal({ result, onClose }: DetailModalProps) {
                         </p>
                       </div>
                     )}
+
                     {result.request.mcpHostConfig && (
                       <div className="flex gap-2">
                         <span className="px-2 py-1 rounded text-xs font-medium bg-violet-100 text-violet-700 dark:bg-violet-900/30 dark:text-violet-300">
@@ -278,14 +653,210 @@ export function DetailModal({ result, onClose }: DetailModalProps) {
                         )}
                       </div>
                     )}
+
+                    {externalHostConfig && (
+                      <div className="space-y-3">
+                        <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground">
+                          External Host Driver
+                        </h4>
+                        <div className="grid grid-cols-1 sm:grid-cols-2 gap-3">
+                          <InfoField
+                            label="Driver Slug"
+                            value={
+                              <code className="text-xs break-all">
+                                {externalHostConfig.driverSlug ??
+                                  (typeof externalHostConfig.driver === 'string'
+                                    ? externalHostConfig.driver
+                                    : 'external host')}
+                              </code>
+                            }
+                          />
+                          <InfoField
+                            label="Display Name"
+                            value={
+                              externalHostConfig.name ??
+                              result.externalHost?.displayName ??
+                              'external host'
+                            }
+                          />
+                          {typeof externalHostConfig.driver === 'object' && (
+                            <>
+                              <InfoField
+                                label="Provider / Product"
+                                value={`${externalHostConfig.driver.provider} / ${externalHostConfig.driver.product}`}
+                              />
+                              <InfoField
+                                label="Surface / Runtime"
+                                value={`${externalHostConfig.driver.surface} / ${externalHostConfig.driver.runtime}`}
+                              />
+                              {externalHostConfig.driver.platform && (
+                                <InfoField
+                                  label="Platform"
+                                  value={externalHostConfig.driver.platform}
+                                />
+                              )}
+                              {externalHostConfig.driver.channel && (
+                                <InfoField
+                                  label="Channel"
+                                  value={externalHostConfig.driver.channel}
+                                />
+                              )}
+                            </>
+                          )}
+                          {externalHostConfig.hostType && (
+                            <InfoField
+                              label="Host Type"
+                              value={externalHostConfig.hostType}
+                            />
+                          )}
+                          {externalHostConfig.variant && (
+                            <InfoField
+                              label="Variant"
+                              value={externalHostConfig.variant}
+                            />
+                          )}
+                          {externalHostConfig.timeoutMs !== undefined && (
+                            <InfoField
+                              label="Timeout"
+                              value={formatMs(externalHostConfig.timeoutMs)}
+                            />
+                          )}
+                          {externalHostConfig.usesBuiltInDefaults !==
+                            undefined && (
+                            <InfoField
+                              label="Built-in Defaults"
+                              value={
+                                externalHostConfig.usesBuiltInDefaults
+                                  ? 'applied'
+                                  : 'not applied'
+                              }
+                            />
+                          )}
+                          {externalHostConfig.correlation && (
+                            <InfoField
+                              label="Correlation"
+                              value={
+                                <div className="space-y-1">
+                                  <code className="text-xs">
+                                    {externalHostConfig.correlation.strategy ??
+                                      'none'}
+                                  </code>
+                                  {externalHostConfig.correlation
+                                    .includeInPrompt !== undefined && (
+                                    <p className="text-xs text-muted-foreground">
+                                      prompt marker:{' '}
+                                      {externalHostConfig.correlation
+                                        .includeInPrompt
+                                        ? 'included'
+                                        : 'not included'}
+                                    </p>
+                                  )}
+                                  {externalHostConfig.correlation
+                                    .promptTemplate && (
+                                    <p className="text-xs text-muted-foreground break-all">
+                                      template:{' '}
+                                      {
+                                        externalHostConfig.correlation
+                                          .promptTemplate
+                                      }
+                                    </p>
+                                  )}
+                                </div>
+                              }
+                            />
+                          )}
+                        </div>
+
+                        {externalHostConfig.capabilities &&
+                          Object.keys(externalHostConfig.capabilities).length >
+                            0 && (
+                            <div>
+                              <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-2">
+                                Capability Bindings
+                              </h4>
+                              <div className="overflow-x-auto rounded-md border">
+                                <table className="w-full text-xs">
+                                  <thead className="bg-muted text-muted-foreground">
+                                    <tr>
+                                      <th className="text-left p-2 font-medium">
+                                        Capability
+                                      </th>
+                                      <th className="text-left p-2 font-medium">
+                                        Implementation
+                                      </th>
+                                      <th className="text-left p-2 font-medium">
+                                        Provides
+                                      </th>
+                                      <th className="text-left p-2 font-medium">
+                                        Options
+                                      </th>
+                                    </tr>
+                                  </thead>
+                                  <tbody>
+                                    {Object.entries(
+                                      externalHostConfig.capabilities
+                                    ).flatMap(([capability, bindings]) =>
+                                      bindings.map((binding, index) => (
+                                        <tr
+                                          key={`${capability}-${index}`}
+                                          className="border-t"
+                                        >
+                                          <td className="p-2 font-mono">
+                                            {capability}
+                                          </td>
+                                          <td className="p-2 font-mono break-all">
+                                            {binding.uses}
+                                          </td>
+                                          <td className="p-2">
+                                            {binding.provides?.join(', ') ??
+                                              '-'}
+                                          </td>
+                                          <td className="p-2">
+                                            {binding.with ? (
+                                              <pre className="font-mono whitespace-pre-wrap">
+                                                {jsonPreview(binding.with)}
+                                              </pre>
+                                            ) : (
+                                              '-'
+                                            )}
+                                          </td>
+                                        </tr>
+                                      ))
+                                    )}
+                                  </tbody>
+                                </table>
+                              </div>
+                            </div>
+                          )}
+
+                        {externalHostConfig.options &&
+                          Object.keys(externalHostConfig.options).length >
+                            0 && (
+                            <div>
+                              <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-1">
+                                Driver Options
+                              </h4>
+                              <JsonBlock value={externalHostConfig.options} />
+                            </div>
+                          )}
+                      </div>
+                    )}
+
+                    {result.request.expect && (
+                      <div>
+                        <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-1">
+                          Configured Expectations
+                        </h4>
+                        <JsonBlock value={result.request.expect} />
+                      </div>
+                    )}
+
                     {result.request.args && (
                       <div>
                         <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-1">
                           Arguments
                         </h4>
-                        <pre className="text-xs font-mono bg-muted p-3 rounded-md overflow-x-auto">
-                          {JSON.stringify(result.request.args, null, 2)}
-                        </pre>
+                        <JsonBlock value={result.request.args} />
                       </div>
                     )}
                   </div>
@@ -311,45 +882,334 @@ export function DetailModal({ result, onClose }: DetailModalProps) {
                 defaultOpen={true}
                 badge={
                   <span className="text-xs text-muted-foreground ml-auto">
-                    {
-                      Object.values(result.expectations).filter((e) => e?.pass)
-                        .length
-                    }
-                    /{Object.values(result.expectations).filter(Boolean).length}{' '}
-                    passed
+                    {expectationRows.filter(([, e]) => e.pass).length}/
+                    {expectationRows.length} passed
                   </span>
                 }
               >
                 <div className="space-y-2">
-                  {Object.entries(result.expectations)
-                    .filter(([_, exp]) => exp !== undefined)
-                    .map(([type, exp]) => (
-                      <div
-                        key={type}
-                        className={`p-3 rounded-md border-l-4 ${
-                          exp.pass
-                            ? 'border-green-500 bg-green-500/10'
-                            : 'border-red-500 bg-red-500/10'
-                        }`}
-                      >
-                        <div className="flex items-center gap-2 mb-1">
+                  {expectationRows.map(([type, exp]) => (
+                    <div
+                      key={type}
+                      className={`p-3 rounded-md border-l-4 ${
+                        exp.pass
+                          ? 'border-green-500 bg-green-500/10'
+                          : 'border-red-500 bg-red-500/10'
+                      }`}
+                    >
+                      <div className="flex items-center gap-2 mb-1">
+                        <span
+                          className={`text-sm font-semibold ${
+                            exp.pass
+                              ? 'text-green-700 dark:text-green-400'
+                              : 'text-red-700 dark:text-red-400'
+                          }`}
+                        >
+                          {exp.pass ? '✓' : '✗'} {type}
+                        </span>
+                      </div>
+                      {exp.details && (
+                        <pre className="text-xs text-muted-foreground font-mono whitespace-pre-wrap">
+                          {stripAnsiCodes(exp.details)}
+                        </pre>
+                      )}
+                    </div>
+                  ))}
+                </div>
+              </CollapsibleSection>
+            )}
+
+            {result.externalHost && (
+              <CollapsibleSection
+                title="Host Outcomes & Evidence"
+                defaultOpen={true}
+              >
+                <div className="space-y-4">
+                  {answer && (
+                    <div>
+                      <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-1">
+                        Final Answer
+                      </h4>
+                      <p className="text-sm bg-muted p-3 rounded-md whitespace-pre-wrap">
+                        {answer}
+                      </p>
+                    </div>
+                  )}
+
+                  <div className="grid grid-cols-2 sm:grid-cols-4 gap-2">
+                    <div className="rounded-md bg-muted p-3">
+                      <div className="text-xs text-muted-foreground">
+                        Tool Calls
+                      </div>
+                      <div className="text-lg font-semibold">
+                        {hostToolCalls.length}
+                      </div>
+                    </div>
+                    <div className="rounded-md bg-muted p-3">
+                      <div className="text-xs text-muted-foreground">
+                        Input Tokens
+                      </div>
+                      <div className="text-lg font-semibold">
+                        {formatNumber(numberField(hostUsage, 'inputTokens'))}
+                      </div>
+                    </div>
+                    <div className="rounded-md bg-muted p-3">
+                      <div className="text-xs text-muted-foreground">
+                        Output Tokens
+                      </div>
+                      <div className="text-lg font-semibold">
+                        {formatNumber(numberField(hostUsage, 'outputTokens'))}
+                      </div>
+                    </div>
+                    <div className="rounded-md bg-muted p-3">
+                      <div className="text-xs text-muted-foreground">Cost</div>
+                      <div className="text-lg font-semibold">
+                        {formatCost(numberField(hostUsage, 'totalCostUsd'))}
+                      </div>
+                    </div>
+                  </div>
+
+                  {hostToolCalls.length > 0 && (
+                    <div>
+                      <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-2">
+                        Observed Tool Calls
+                      </h4>
+                      <div className="space-y-2">
+                        {hostToolCalls.map((call, i) => (
+                          <div
+                            key={`${call.name}-${i}`}
+                            className="rounded-md border bg-muted/50 p-3 text-xs"
+                          >
+                            <div className="flex items-center gap-2 mb-2">
+                              <code className="font-semibold">{call.name}</code>
+                              {call.id && (
+                                <span className="text-muted-foreground font-mono">
+                                  {call.id}
+                                </span>
+                              )}
+                            </div>
+                            <JsonBlock value={call.arguments} />
+                          </div>
+                        ))}
+                      </div>
+                    </div>
+                  )}
+
+                  {hostUsage && (
+                    <div>
+                      <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-2">
+                        Usage & Durations
+                      </h4>
+                      <div className="grid grid-cols-1 sm:grid-cols-2 gap-2 text-sm">
+                        <InfoField
+                          label="Total Cost"
+                          value={formatCost(
+                            numberField(hostUsage, 'totalCostUsd')
+                          )}
+                        />
+                        <InfoField
+                          label="Host Duration"
+                          value={formatMs(numberField(hostUsage, 'durationMs'))}
+                        />
+                        <InfoField
+                          label="API Duration"
+                          value={formatMs(
+                            numberField(hostUsage, 'durationApiMs')
+                          )}
+                        />
+                        <InfoField
+                          label="LLM Duration"
+                          value={formatMs(llmDurationMs)}
+                        />
+                        <InfoField
+                          label="MCP Duration"
+                          value={formatMs(mcpDurationMs)}
+                        />
+                        <InfoField
+                          label="Reporter Duration"
+                          value={formatMs(result.durationMs)}
+                        />
+                        {numberField(hostUsage, 'cacheReadInputTokens') !==
+                          undefined && (
+                          <InfoField
+                            label="Cache Read Tokens"
+                            value={formatNumber(
+                              numberField(hostUsage, 'cacheReadInputTokens')
+                            )}
+                          />
+                        )}
+                        {numberField(hostUsage, 'cacheCreationInputTokens') !==
+                          undefined && (
+                          <InfoField
+                            label="Cache Write Tokens"
+                            value={formatNumber(
+                              numberField(hostUsage, 'cacheCreationInputTokens')
+                            )}
+                          />
+                        )}
+                      </div>
+                    </div>
+                  )}
+
+                  <div className="grid grid-cols-1 sm:grid-cols-2 gap-3 text-sm">
+                    <InfoField
+                      label="Host"
+                      value={
+                        <>
+                          <p className="font-medium">
+                            {result.externalHost.displayName}
+                            {result.externalHost.hostVariant
+                              ? ` / ${result.externalHost.hostVariant}`
+                              : ''}
+                          </p>
+                          <p className="font-mono text-xs text-muted-foreground break-all mt-1">
+                            {result.externalHost.driverSlug}
+                          </p>
+                        </>
+                      }
+                    />
+                    <InfoField
+                      label="Evidence"
+                      value={`${result.externalHost.traceSource} · ${result.externalHost.traceConfidence}`}
+                    />
+                    <InfoField
+                      label="Session"
+                      value={
+                        <code className="text-xs break-all">
+                          {result.externalHost.session.id ?? 'unknown'}
+                        </code>
+                      }
+                    />
+                    <InfoField
+                      label="Request"
+                      value={
+                        <code className="text-xs break-all">
+                          {result.externalHost.session.requestId ?? 'unknown'}
+                        </code>
+                      }
+                    />
+                    <InfoField
+                      label="Run Marker"
+                      value={
+                        <code className="text-xs break-all">
+                          {result.externalHost.session.runMarker}
+                        </code>
+                      }
+                    />
+                    <InfoField
+                      label="Correlation Strategy"
+                      value={
+                        <>
+                          <code className="text-xs">
+                            {result.externalHost.correlation.strategy}
+                          </code>
+                          <p className="text-xs text-muted-foreground mt-1">
+                            prompt marker{' '}
+                            {result.externalHost.correlation.includedInPrompt
+                              ? 'included'
+                              : 'not included'}
+                          </p>
+                        </>
+                      }
+                    />
+                    {result.externalHost.session.cliSessionId && (
+                      <InfoField
+                        label="CLI Session"
+                        value={
+                          <code className="text-xs break-all">
+                            {result.externalHost.session.cliSessionId}
+                          </code>
+                        }
+                      />
+                    )}
+                  </div>
+
+                  <div>
+                    <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-2">
+                      Capabilities
+                    </h4>
+                    <div className="flex flex-wrap gap-2">
+                      {result.externalHost.capabilitiesUsed.map(
+                        (capability) => (
                           <span
-                            className={`text-sm font-semibold ${
-                              exp.pass
-                                ? 'text-green-700 dark:text-green-400'
-                                : 'text-red-700 dark:text-red-400'
-                            }`}
+                            key={capability}
+                            className="px-2 py-1 rounded text-xs bg-muted text-muted-foreground"
                           >
-                            {exp.pass ? '✓' : '✗'} {type}
+                            {capability}
                           </span>
-                        </div>
-                        {exp.details && (
-                          <pre className="text-xs text-muted-foreground font-mono whitespace-pre-wrap">
-                            {stripAnsiCodes(exp.details)}
-                          </pre>
-                        )}
+                        )
+                      )}
+                    </div>
+                  </div>
+
+                  {externalHostEvidenceRows.length > 0 && (
+                    <div>
+                      <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-2">
+                        Evidence Sources
+                      </h4>
+                      <div className="grid grid-cols-1 sm:grid-cols-2 gap-2 text-xs">
+                        {externalHostEvidenceRows.map((row) => (
+                          <div key={row.key} className="rounded bg-muted p-2">
+                            <div className="font-medium">{row.label}</div>
+                            <div className="text-muted-foreground">
+                              {row.source} · {row.confidence}
+                            </div>
+                          </div>
+                        ))}
                       </div>
-                    ))}
+                    </div>
+                  )}
+
+                  {result.externalHost.failureKind && (
+                    <div className="rounded-md bg-orange-500/10 text-orange-700 dark:text-orange-300 p-3 text-sm">
+                      Host failure: {result.externalHost.failureKind}
+                    </div>
+                  )}
+
+                  {result.externalHost.traceLimitations &&
+                    result.externalHost.traceLimitations.length > 0 && (
+                      <div>
+                        <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-2">
+                          Limitations
+                        </h4>
+                        <ul className="space-y-1 text-sm text-muted-foreground">
+                          {result.externalHost.traceLimitations.map(
+                            (limitation, i) => (
+                              <li key={i}>{limitation}</li>
+                            )
+                          )}
+                        </ul>
+                      </div>
+                    )}
+
+                  {result.externalHost.artifacts.length > 0 && (
+                    <div>
+                      <h4 className="text-xs font-semibold uppercase tracking-wide text-muted-foreground mb-2">
+                        Artifacts
+                      </h4>
+                      <div className="space-y-2">
+                        {result.externalHost.artifacts.map((artifact, i) => (
+                          <div
+                            key={i}
+                            className="rounded-md bg-muted p-3 text-xs"
+                          >
+                            <div className="font-medium">{artifact.name}</div>
+                            <div className="text-muted-foreground">
+                              {artifact.kind}
+                              {artifact.contentType
+                                ? ` · ${artifact.contentType}`
+                                : ''}
+                            </div>
+                            {artifact.path && (
+                              <pre className="mt-1 font-mono whitespace-pre-wrap break-all">
+                                {artifact.path}
+                              </pre>
+                            )}
+                          </div>
+                        ))}
+                      </div>
+                    </div>
+                  )}
                 </div>
               </CollapsibleSection>
             )}
@@ -446,6 +1306,11 @@ export function DetailModal({ result, onClose }: DetailModalProps) {
                             Tools called
                           </th>
                         )}
+                        {iterations.some((r) => r.externalHost) && (
+                          <th className="text-left py-2 pr-4 font-medium">
+                            Host trace
+                          </th>
+                        )}
                         <th className="text-left py-2 font-medium">Error</th>
                       </tr>
                     </thead>
@@ -522,6 +1387,24 @@ export function DetailModal({ result, onClose }: DetailModalProps) {
                               )}
                             </td>
                           )}
+                          {iterations.some((r) => r.externalHost) && (
+                            <td className="py-2 pr-4">
+                              {iter.externalHost ? (
+                                <span
+                                  className="text-xs text-muted-foreground"
+                                  title={iter.externalHost.traceSource}
+                                >
+                                  {iter.externalHost.driverSlug ??
+                                    iter.externalHost.hostName}{' '}
+                                  · {iter.externalHost.traceConfidence}
+                                </span>
+                              ) : (
+                                <span className="text-xs text-muted-foreground">
+                                  —
+                                </span>
+                              )}
+                            </td>
+                          )}
                           <td className="py-2 text-xs text-muted-foreground font-mono">
                             {iter.error ? stripAnsiCodes(iter.error) : '—'}
                           </td>
diff --git a/src/reporters/ui-src/components/Results/ResultsTable.tsx b/src/reporters/ui-src/components/Results/ResultsTable.tsx
index f9885d2..171381a 100644
--- a/src/reporters/ui-src/components/Results/ResultsTable.tsx
+++ b/src/reporters/ui-src/components/Results/ResultsTable.tsx
@@ -26,6 +26,65 @@ function formatMs(ms: number): string {
   return ms >= 1000 ? `${(ms / 1000).toFixed(1)}s` : `${ms.toFixed(0)}ms`;
 }
 
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return !(value === null || typeof value !== 'object' || Array.isArray(value)); // non-null, non-array objects only
+}
+
+function toolCallCount(result: EvalCaseResult): number {
+  const { response } = result; // only an object-shaped response can carry `toolCalls`
+  const calls = isRecord(response) ? response.toolCalls : undefined;
+  return Array.isArray(calls) ? calls.length : 0; // anything but an array counts as zero calls
+}
+
+/** Usage record for a row: `hostUsage` when it is an object, otherwise an object-valued `response.usage`, otherwise `undefined`. */
+function usageRecord(
+  result: EvalCaseResult
+): Record<string, unknown> | undefined {
+  // The runtime guard narrows hostUsage, so no unchecked double assertion is needed.
+  if (isRecord(result.hostUsage)) return result.hostUsage;
+  const response = isRecord(result.response) ? result.response : undefined;
+  return isRecord(response?.usage) ? response.usage : undefined;
+}
+
+function numberField(
+  value: Record<string, unknown> | undefined,
+  key: string
+): number | undefined {
+  const candidate = value === undefined ? undefined : value[key]; // no record → no field
+  return typeof candidate !== 'number' ? undefined : candidate; // only genuine numbers pass through
+}
+
+function formatCost(value: number): string {
+  return '$' + value.toFixed(value !== 0 ? 4 : 2); // exact zero renders as $0.00, everything else with 4 decimals
+}
+
+/** Names of the expectation types whose recorded result is defined and did not pass. */
+function failedExpectationTypes(result: EvalCaseResult): string[] {
+  const entries = Object.entries(result.expectations ?? {});
+  return entries.filter(([, e]) => e !== undefined && !e.pass).map(([t]) => t);
+}
+
+/**
+ * Short reason text for a failing row, or `undefined` when the case passed.
+ * Precedence: external-host failure kind, then execution error, then the
+ * names of failed expectations, then the generic 'failed' fallback.
+ */
+function failureLabel(result: EvalCaseResult): string | undefined {
+  const { pass, externalHost, error } = result;
+  if (pass) {
+    return undefined;
+  }
+  const hostFailure = externalHost?.failureKind;
+  if (hostFailure) {
+    return `host: ${hostFailure}`;
+  }
+  if (error) {
+    return 'execution error';
+  }
+  const failing = failedExpectationTypes(result);
+  return failing.length > 0 ? `assertion: ${failing.join(', ')}` : 'failed';
+}
+
 interface ResultRowProps {
   result: EvalCaseResult;
   onSelectResult?: (result: EvalCaseResult) => void;
@@ -45,6 +104,13 @@ function ResultRow({
   const iterDots = result.iterationResults ?? [];
   const cappedDots = iterDots.slice(0, 10);
   const hasMore = iterDots.length > 10;
+  const observedToolCallCount = toolCallCount(result);
+  const usage = usageRecord(result);
+  const inputTokens = numberField(usage, 'inputTokens') ?? 0;
+  const outputTokens = numberField(usage, 'outputTokens') ?? 0;
+  const totalTokens = inputTokens + outputTokens;
+  const totalCostUsd = numberField(usage, 'totalCostUsd');
+  const rowFailureLabel = failureLabel(result);
 
   const ariaLabel = `${result.toolName ? result.toolName + ': ' : ''}${result.id}, ${result.pass ? 'passed' : 'failed'}`;
 
@@ -90,6 +156,14 @@ function ResultRow({
           ▲ fixed
         </span>
       )}
+      {rowFailureLabel && (
+        <span
+          className="inline-flex items-center px-2 py-0.5 rounded text-xs shrink-0 bg-red-500/10 text-red-700 dark:text-red-400 max-w-56 truncate"
+          title={rowFailureLabel}
+        >
+          {rowFailureLabel}
+        </span>
+      )}
 
       {result.assertionPassRate !== undefined && (
         <span
@@ -160,6 +234,63 @@ function ResultRow({
         </span>
       )}
 
+      {result.externalHost && (
+        <>
+          <span
+            className="inline-flex items-center px-2 py-0.5 rounded text-xs shrink-0 bg-teal-500/15 text-teal-700 dark:text-teal-300"
+            title={`External host: ${result.externalHost.driverSlug}`}
+          >
+            {result.externalHost.driver.provider}/
+            {result.externalHost.driver.product}
+          </span>
+          <span
+            className="inline-flex items-center px-2 py-0.5 rounded text-xs shrink-0 bg-cyan-500/15 text-cyan-700 dark:text-cyan-300"
+            title={result.externalHost.driverSlug}
+          >
+            {result.externalHost.driver.surface} ·{' '}
+            {result.externalHost.driver.runtime}
+            {result.externalHost.driver.platform
+              ? ` · ${result.externalHost.driver.platform}`
+              : ''}
+          </span>
+          <span
+            className={`inline-flex items-center px-2 py-0.5 rounded text-xs shrink-0 ${
+              result.externalHost.traceConfidence === 'high'
+                ? 'bg-green-500/15 text-green-700 dark:text-green-400'
+                : result.externalHost.traceConfidence === 'medium'
+                  ? 'bg-amber-500/15 text-amber-700 dark:text-amber-400'
+                  : 'bg-gray-500/15 text-gray-700 dark:text-gray-300'
+            }`}
+            title={`Trace source: ${result.externalHost.traceSource}`}
+          >
+            {result.externalHost.traceConfidence} trace
+          </span>
+          <span
+            className="inline-flex items-center px-2 py-0.5 rounded text-xs shrink-0 bg-muted text-muted-foreground"
+            title="Observed tool calls in the normalized host trace"
+          >
+            {observedToolCallCount} tool
+            {observedToolCallCount === 1 ? '' : 's'}
+          </span>
+          {totalTokens > 0 && (
+            <span
+              className="inline-flex items-center px-2 py-0.5 rounded text-xs shrink-0 bg-muted text-muted-foreground"
+              title="Input + output tokens reported by the host trace"
+            >
+              {totalTokens.toLocaleString()} tokens
+            </span>
+          )}
+          {totalCostUsd !== undefined && (
+            <span
+              className="inline-flex items-center px-2 py-0.5 rounded text-xs shrink-0 bg-muted text-muted-foreground"
+              title="Cost reported by the host trace"
+            >
+              {formatCost(totalCostUsd)}
+            </span>
+          )}
+        </>
+      )}
+
       <span className="shrink-0">
         {isEval ? (
           <BarChart3
@@ -178,7 +309,9 @@ function ResultRow({
 
       <span className="flex-1 text-sm font-medium truncate">{result.id}</span>
 
-      {result.toolName && result.toolName !== 'mcp_host' ? (
+      {result.toolName &&
+      result.toolName !== 'mcp_host' &&
+      result.toolName !== 'external_host' ? (
         <code className="text-xs bg-muted px-2 py-1 rounded shrink-0">
           {result.toolName}
         </code>
@@ -186,6 +319,10 @@ function ResultRow({
         <span className="text-xs text-muted-foreground shrink-0 italic">
           mcp_host
         </span>
+      ) : result.toolName === 'external_host' ? (
+        <span className="text-xs text-muted-foreground shrink-0 italic">
+          external_host
+        </span>
       ) : null}
 
       {showProjectBadge && result.project && (
diff --git a/src/types/reporter.ts b/src/types/reporter.ts
index 66cf00a..fdd1f43 100644
--- a/src/types/reporter.ts
+++ b/src/types/reporter.ts
@@ -14,6 +14,17 @@ import type {
   ExpectationBreakdown,
   UsageMetrics,
 } from './index.js';
+import type {
+  ExternalHostCorrelationConfig,
+  ExternalHostMetadata,
+  HostDriverId,
+} from '../evals/externalHost/types.js';
+
+export interface SerializedExternalHostCapabilityBinding { // element type of the `capabilities` lists on EvalCaseRequest.externalHost
+  uses: string; // required; presumably identifies the capability implementation — TODO confirm
+  provides?: string[]; // optional list of strings; meaning not visible in this file — confirm against the externalHost types
+  with?: Record<string, unknown>; // optional free-form key/value bag (presumably binding options — confirm)
+}
 
 /**
  * Configuration options for MCP Eval Reporter
@@ -194,6 +205,8 @@ export interface IterationResult {
   };
   /** Token usage from mcp_host LLM simulation in this iteration */
   hostUsage?: UsageMetrics;
+  /** External host metadata for this iteration */
+  externalHost?: ExternalHostMetadata;
 }
 
 /**
@@ -201,11 +214,29 @@ export interface IterationResult {
  * Preserves what was sent so results are self-contained for debugging.
  */
 export interface EvalCaseRequest {
+  /** Eval execution mode */
+  mode?: string;
+
   /** Human-readable description of the case */
   description?: string;
   /** Runtime tool override variant identifier, when one was used */
   toolOverrideVariantId?: string;
 
+  /** Number of iterations configured for this case */
+  iterations?: number;
+
+  /** Accuracy threshold configured for this case */
+  accuracyThreshold?: number;
+
+  /** Judge repetitions configured for this case */
+  judgeReps?: number;
+
+  /** Tags from the source eval case */
+  tags?: string[];
+
+  /** Configured expectation block, sanitized for reporter output */
+  expect?: Record<string, unknown>;
+
   // Direct mode fields
   /** Tool arguments (direct mode) */
   args?: Record<string, unknown>;
@@ -218,6 +249,19 @@ export interface EvalCaseRequest {
     provider?: string;
     model?: string;
   };
+  /** External host configuration summary (external_host mode) */
+  externalHost?: {
+    driver: HostDriverId | string;
+    driverSlug?: string;
+    name?: string;
+    hostType?: string;
+    variant?: string;
+    timeoutMs?: number;
+    usesBuiltInDefaults?: boolean;
+    correlation?: ExternalHostCorrelationConfig;
+    options?: Record<string, unknown>;
+    capabilities?: Record<string, SerializedExternalHostCapabilityBinding[]>;
+  };
 }
 
 /**
@@ -377,6 +421,12 @@ export interface EvalCaseResult {
    * Summed across all iterations. Only populated for mcp_host mode cases.
    */
   hostUsage?: UsageMetrics;
+
+  /**
+   * External host trace and evidence metadata.
+   * Populated for external_host mode cases.
+   */
+  externalHost?: ExternalHostMetadata;
 }
 
 /**
diff --git a/vitest.config.mts b/vitest.config.mts
index 63e3f76..8bddbb9 100644
--- a/vitest.config.mts
+++ b/vitest.config.mts
@@ -5,7 +5,12 @@ export default defineConfig({
     globals: true,
     environment: 'node',
     include: ['src/**/*.test.ts', 'tests/**/*.test.ts'],
-    exclude: ['node_modules', 'dist', 'tests/**/*.spec.ts'],
+    exclude: [
+      'node_modules',
+      'dist',
+      'tests/**/*.spec.ts',
+      'src/**/*.integration.test.ts',
+    ],
     coverage: {
       provider: 'v8',
       reporter: ['text', 'json', 'html'],
diff --git a/vitest.external-host.config.mts b/vitest.external-host.config.mts
new file mode 100644
index 0000000..438d7de
--- /dev/null
+++ b/vitest.external-host.config.mts
@@ -0,0 +1,10 @@
+import { defineConfig } from 'vitest/config';
+
+export default defineConfig({
+  test: {
+    globals: true,
+    environment: 'node',
+    include: ['src/**/*.integration.test.ts'], // the same glob that vitest.config.mts lists under `exclude`
+    testTimeout: 150_000, // 150 000 ms (2.5 min), assuming Vitest's millisecond unit
+  },
+});