gleanwork · steve-calvert-glean · May 11, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
@@ -435,10 +435,7 @@ The result includes pass-rate deltas, optional tool precision/recall/F1 deltas,
 
 **Result Structure:**
 
-```typescript snippet=src/evals/evalRunner.ts#L106-L184
-/**
- * Overall result of running an eval dataset
- */
+```typescript snippet=src/evals/evalRunner.ts#L121-L195
 export interface EvalRunnerResult {
   /**
    * Total number of cases
@@ -1043,7 +1040,12 @@ interface MCPConformanceResult {
 
 ### `EvalExpectBlock`
 
-```typescript snippet=src/evals/datasetTypes.ts#L186-L277
+```typescript snippet=src/evals/datasetTypes.ts#L190-L288
+/**
+ * Unified expectation block for eval cases
+ *
+ * Mirrors the Playwright matcher API for consistency.
+ */
 export interface EvalExpectBlock {
   /**
    * Exact response match (toMatchToolResponse)
@@ -1102,8 +1104,9 @@ export interface EvalExpectBlock {
   };
 
   /**
-   * Asserts which tools the LLM called during a mcp_host simulation.
-   * Only meaningful for mcp_host mode — direct mode has no tool call trace.
+   * Asserts which tools the LLM called during a host simulation.
+   * Only meaningful for mcp_host runs, or for external_host runs that provide
+   * high-confidence structured tool evidence — direct mode has no tool call trace.
    */
   toolsTriggered?: {
     /** Expected tool calls */
@@ -1125,7 +1128,8 @@ export interface EvalExpectBlock {
   };
 
   /**
-   * Asserts the number of tool calls made during a mcp_host simulation.
+   * Asserts the number of tool calls made during a host simulation.
+   * External-host runs require high-confidence structured tool evidence.
    */
   toolCallCount?: {
     /** Minimum number of tool calls */
@@ -1140,7 +1144,14 @@ export interface EvalExpectBlock {
 
 ### `EvalCase`
 
-````typescript snippet=src/evals/datasetTypes.ts#L27-L139
+````typescript snippet=src/evals/datasetTypes.ts#L23-L148
+/**
+ * A single eval test case
+ *
+ * For 'direct' mode: toolName and args are required
+ * For 'mcp_host' mode: scenario and mcpHostConfig are required
+ * For 'external_host' mode: scenario and externalHost are required
+ */
 export interface EvalCase {
   /**
    * Unique identifier for this test case
@@ -1155,7 +1166,8 @@ export interface EvalCase {
   /**
    * Evaluation mode
    * - 'direct': Direct API calls to MCP tools (default)
-   * - 'mcp_host': LLM-driven tool selection via natural language
+   * - 'mcp_host': SDK/CLI host simulation via natural language
+   * - 'external_host': Real external MCP host driven by configured capabilities
    *
    * @default 'direct'
    */
@@ -1172,7 +1184,7 @@ export interface EvalCase {
   args?: Record<string, unknown>;
 
   /**
-   * Natural language scenario for LLM to execute (optional, required for 'mcp_host' mode)
+   * Natural language scenario for the LLM to execute (required for 'mcp_host' and 'external_host' modes)
    *
    * @example "Get the weather for London and tell me if I need an umbrella"
    */
@@ -1185,6 +1197,11 @@ export interface EvalCase {
    */
   mcpHostConfig?: MCPHostConfig;
 
+  /**
+   * External host configuration (required for 'external_host' mode)
+   */
+  externalHost?: ExternalHostConfig;
+
   /**
    * Additional metadata for this test case
    *
@@ -1256,18 +1273,6 @@ export interface EvalCase {
 }
 ````
 
-### `EvalDataset`
-
-```typescript
-interface EvalDataset {
-  name: string;
-  description?: string;
-  cases: EvalCase[];
-  metadata?: Record<string, unknown>;
-  schemas?: Record<string, ZodSchema>; // Zod schemas for toMatchToolSchema assertions
-}
-```
-
 ## Next Steps
 
 - See the [Authentication Guide](./authentication.md) for OAuth and token auth

@@ -72,6 +72,9 @@
     "preview-reporter": "tsx scripts/preview-reporter.ts",
     "test": "vitest run",
     "test:all": "npm run build && npm run format:check && npm run lint && npm run typecheck && npm test",
+    "test:external-host": "vitest run --config vitest.external-host.config.mts",
+    "test:external-host:chat": "vitest run --config vitest.external-host.config.mts -t \"Claude Chat\"",
+    "test:external-host:cowork": "vitest run --config vitest.external-host.config.mts -t \"Claude Cowork\"",
     "test:playwright": "playwright test",
     "test:watch": "vitest",
     "typecheck": "tsc --noEmit"
@@ -84,6 +87,7 @@
     "debug": "^4.4.3",
     "ink": "^5.2.1",
     "ink-spinner": "^5.0.0",
+    "ndjson": "^2.0.0",
     "oauth4webapi": "^3.0.0",
     "open": "^10.1.0",
     "react": "^18.3.1",
@@ -95,6 +99,7 @@
     "@playwright/test": "^1.49.0",
     "@release-it-plugins/lerna-changelog": "^8.0.1",
     "@types/debug": "^4.1.12",
+    "@types/ndjson": "^2.0.4",
     "@types/node": "^22.10.2",
     "@types/react": "^18.3.12",
     "@types/react-dom": "^18.3.1",

@@ -182,7 +182,7 @@ describe('validateToolCalls', () => {
       calls: [{ name: 'search' }],
     });
     expect(v.pass).toBe(false);
-    expect(v.message).toContain('mcp_host');
+    expect(v.message).toContain('host simulation response');
   });
 });
 
@@ -292,6 +292,6 @@ describe('validateToolCallCount', () => {
   it('returns error when response is not an MCPHostSimulationResult', () => {
     const v = validateToolCallCount('not a simulation result', { exact: 1 });
     expect(v.pass).toBe(false);
-    expect(v.message).toContain('mcp_host');
+    expect(v.message).toContain('host simulation response');
   });
 });
@@ -102,9 +102,9 @@ function findMatchingCall(
 }
 
 /**
- * Validates tool calls made during an MCP host simulation.
+ * Validates tool calls made during a host simulation.
  *
- * @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
+ * @param response - Must be an MCPHostSimulationResult-compatible response
  * @param expectation - Expected tool call specification
  */
 export function validateToolCalls(
@@ -115,7 +115,7 @@ export function validateToolCalls(
     return {
       pass: false,
       message:
-        'toolsTriggered expectation requires mcp_host mode — response must be an MCPHostSimulationResult',
+        'toolsTriggered expectation requires a host simulation response with structured tool calls',
     };
   }
 
@@ -206,9 +206,9 @@ export function validateToolCalls(
 }
 
 /**
- * Validates the number of tool calls made during an MCP host simulation.
+ * Validates the number of tool calls made during a host simulation.
  *
- * @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
+ * @param response - Must be an MCPHostSimulationResult-compatible response
  * @param options - Count constraints (min, max, exact)
  */
 export function validateToolCallCount(
@@ -219,7 +219,7 @@ export function validateToolCallCount(
     return {
       pass: false,
       message:
-        'toolCallCount expectation requires mcp_host mode — response must be an MCPHostSimulationResult',
+        'toolCallCount expectation requires a host simulation response with structured tool calls',
     };
   }
 

@@ -183,6 +183,18 @@ describe('validateText', () => {
       const result = validateText(response, 'result');
       expect(result.pass).toBe(true);
     });
+
+    it('should prefer host simulation final response over metadata JSON', () => {
+      const response = {
+        response: 'final answer text',
+        externalHost: {
+          traceLimitations: ['metadata-only text'],
+        },
+      };
+
+      expect(validateText(response, 'final answer').pass).toBe(true);
+      expect(validateText(response, 'metadata-only').pass).toBe(false);
+    });
   });
 });