Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 28 additions & 23 deletions docs/api-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -435,10 +435,7 @@ The result includes pass-rate deltas, optional tool precision/recall/F1 deltas,

**Result Structure:**

```typescript snippet=src/evals/evalRunner.ts#L106-L184
/**
* Overall result of running an eval dataset
*/
```typescript snippet=src/evals/evalRunner.ts#L121-L195
export interface EvalRunnerResult {
/**
* Total number of cases
Expand Down Expand Up @@ -1043,7 +1040,12 @@ interface MCPConformanceResult {

### `EvalExpectBlock`

```typescript snippet=src/evals/datasetTypes.ts#L186-L277
```typescript snippet=src/evals/datasetTypes.ts#L190-L288
/**
* Unified expectation block for eval cases
*
* Mirrors the Playwright matcher API for consistency.
*/
export interface EvalExpectBlock {
/**
* Exact response match (toMatchToolResponse)
Expand Down Expand Up @@ -1102,8 +1104,9 @@ export interface EvalExpectBlock {
};

/**
* Asserts which tools the LLM called during a mcp_host simulation.
* Only meaningful for mcp_host mode — direct mode has no tool call trace.
* Asserts which tools the LLM called during a host simulation.
* Only meaningful for mcp_host or external_host runs with high-confidence
* structured tool evidence — direct mode has no tool call trace.
*/
toolsTriggered?: {
/** Expected tool calls */
Expand All @@ -1125,7 +1128,8 @@ export interface EvalExpectBlock {
};

/**
* Asserts the number of tool calls made during a mcp_host simulation.
* Asserts the number of tool calls made during a host simulation.
* External-host runs require high-confidence structured tool evidence.
*/
toolCallCount?: {
/** Minimum number of tool calls */
Expand All @@ -1140,7 +1144,14 @@ export interface EvalExpectBlock {

### `EvalCase`

````typescript snippet=src/evals/datasetTypes.ts#L27-L139
````typescript snippet=src/evals/datasetTypes.ts#L23-L148
/**
* A single eval test case
*
* For 'direct' mode: toolName and args are required
* For 'mcp_host' mode: scenario and mcpHostConfig are required
* For 'external_host' mode: scenario and externalHost are required
*/
export interface EvalCase {
/**
* Unique identifier for this test case
Expand All @@ -1155,7 +1166,8 @@ export interface EvalCase {
/**
* Evaluation mode
* - 'direct': Direct API calls to MCP tools (default)
* - 'mcp_host': LLM-driven tool selection via natural language
* - 'mcp_host': SDK/CLI host simulation via natural language
* - 'external_host': Real external MCP host driven by configured capabilities
*
* @default 'direct'
*/
Expand All @@ -1172,7 +1184,7 @@ export interface EvalCase {
args?: Record<string, unknown>;

/**
* Natural language scenario for LLM to execute (optional, required for 'mcp_host' mode)
* Natural language scenario for LLM to execute (required for 'mcp_host' and 'external_host' modes)
*
* @example "Get the weather for London and tell me if I need an umbrella"
*/
Expand All @@ -1185,6 +1197,11 @@ export interface EvalCase {
*/
mcpHostConfig?: MCPHostConfig;

/**
* External host configuration (required for 'external_host' mode)
*/
externalHost?: ExternalHostConfig;

/**
* Additional metadata for this test case
*
Expand Down Expand Up @@ -1256,18 +1273,6 @@ export interface EvalCase {
}
````

### `EvalDataset`

```typescript
interface EvalDataset {
name: string;
description?: string;
cases: EvalCase[];
metadata?: Record<string, unknown>;
schemas?: Record<string, ZodSchema>; // Zod schemas for toMatchToolSchema assertions
}
```

## Next Steps

- See the [Authentication Guide](./authentication.md) for OAuth and token auth
Expand Down
32 changes: 23 additions & 9 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@
"preview-reporter": "tsx scripts/preview-reporter.ts",
"test": "vitest run",
"test:all": "npm run build && npm run format:check && npm run lint && npm run typecheck && npm test",
"test:external-host": "vitest run --config vitest.external-host.config.mts",
"test:external-host:chat": "vitest run --config vitest.external-host.config.mts -t \"Claude Chat\"",
"test:external-host:cowork": "vitest run --config vitest.external-host.config.mts -t \"Claude Cowork\"",
"test:playwright": "playwright test",
"test:watch": "vitest",
"typecheck": "tsc --noEmit"
Expand All @@ -84,6 +87,7 @@
"debug": "^4.4.3",
"ink": "^5.2.1",
"ink-spinner": "^5.0.0",
"ndjson": "^2.0.0",
"oauth4webapi": "^3.0.0",
"open": "^10.1.0",
"react": "^18.3.1",
Expand All @@ -95,6 +99,7 @@
"@playwright/test": "^1.49.0",
"@release-it-plugins/lerna-changelog": "^8.0.1",
"@types/debug": "^4.1.12",
"@types/ndjson": "^2.0.4",
"@types/node": "^22.10.2",
"@types/react": "^18.3.12",
"@types/react-dom": "^18.3.1",
Expand Down
4 changes: 2 additions & 2 deletions src/assertions/validators/toolCalls.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ describe('validateToolCalls', () => {
calls: [{ name: 'search' }],
});
expect(v.pass).toBe(false);
expect(v.message).toContain('mcp_host');
expect(v.message).toContain('host simulation response');
});
});

Expand Down Expand Up @@ -292,6 +292,6 @@ describe('validateToolCallCount', () => {
it('returns error when response is not an MCPHostSimulationResult', () => {
const v = validateToolCallCount('not a simulation result', { exact: 1 });
expect(v.pass).toBe(false);
expect(v.message).toContain('mcp_host');
expect(v.message).toContain('host simulation response');
});
});
12 changes: 6 additions & 6 deletions src/assertions/validators/toolCalls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ function findMatchingCall(
}

/**
* Validates tool calls made during an MCP host simulation.
* Validates tool calls made during a host simulation.
*
* @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
* @param response - Must be an MCPHostSimulationResult-compatible response
* @param expectation - Expected tool call specification
*/
export function validateToolCalls(
Expand All @@ -115,7 +115,7 @@ export function validateToolCalls(
return {
pass: false,
message:
'toolsTriggered expectation requires mcp_host mode — response must be an MCPHostSimulationResult',
'toolsTriggered expectation requires a host simulation response with structured tool calls',
};
}

Expand Down Expand Up @@ -206,9 +206,9 @@ export function validateToolCalls(
}

/**
* Validates the number of tool calls made during an MCP host simulation.
* Validates the number of tool calls made during a host simulation.
*
* @param response - Must be an MCPHostSimulationResult (from mcp_host mode)
* @param response - Must be an MCPHostSimulationResult-compatible response
* @param options - Count constraints (min, max, exact)
*/
export function validateToolCallCount(
Expand All @@ -219,7 +219,7 @@ export function validateToolCallCount(
return {
pass: false,
message:
'toolCallCount expectation requires mcp_host mode — response must be an MCPHostSimulationResult',
'toolCallCount expectation requires a host simulation response with structured tool calls',
};
}

Expand Down
12 changes: 12 additions & 0 deletions src/assertions/validators/validators.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,18 @@ describe('validateText', () => {
const result = validateText(response, 'result');
expect(result.pass).toBe(true);
});

it('should prefer host simulation final response over metadata JSON', () => {
const response = {
response: 'final answer text',
externalHost: {
traceLimitations: ['metadata-only text'],
},
};

expect(validateText(response, 'final answer').pass).toBe(true);
expect(validateText(response, 'metadata-only').pass).toBe(false);
});
});
});

Expand Down
Loading
Loading