From 70eeb8cbbc0ba155ff1a0d3fa891f1115b6326d1 Mon Sep 17 00:00:00 2001 From: Frederik Prijck Date: Tue, 9 Jun 2026 15:04:41 +0200 Subject: [PATCH 1/3] feat: inject compile_command guidance into agent context files Add an optional `compile_command` PROMPT.md frontmatter field. When set, a verify-compiles instruction is appended to the agent's native context file (CLAUDE.md / GEMINI.md / AGENTS.md / copilot-instructions.md) alongside the existing "no docs files" guidance, so the agent verifies the project compiles and the command appears in the tool trace. Wires the field into all 10 quickstart evals. --- AGENTS.md | 3 +- .../src/evals/quickstarts/angular/PROMPT.md | 1 + .../src/evals/quickstarts/express/PROMPT.md | 1 + .../src/evals/quickstarts/fastapi/PROMPT.md | 1 + .../evals/quickstarts/fastify-api/PROMPT.md | 1 + .../src/evals/quickstarts/flask/PROMPT.md | 1 + .../src/evals/quickstarts/nextjs/PROMPT.md | 1 + .../src/evals/quickstarts/nuxt/PROMPT.md | 1 + .../src/evals/quickstarts/react/PROMPT.md | 1 + .../src/evals/quickstarts/spa-js/PROMPT.md | 1 + .../src/evals/quickstarts/vue/PROMPT.md | 1 + docs/ADDING_EVALS.md | 3 ++ packages/eval-core/src/index.ts | 1 + packages/eval-core/src/loader.ts | 2 ++ packages/eval-core/src/types/eval.ts | 1 + packages/eval-core/src/workspace/index.ts | 1 + packages/eval-core/src/workspace/workspace.ts | 20 ++++++++--- packages/eval-core/tests/loader.test.ts | 22 +++++++++++-- packages/eval-core/tests/workspace.test.ts | 33 +++++++++++++++++++ packages/eval/src/cli/run.ts | 2 +- 20 files changed, 90 insertions(+), 8 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index d5845885..a428d7ab 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -137,6 +137,7 @@ When you make a change, update every doc whose described behavior is affected. T |---|---| | New eval added (`PROMPT.md` + `graders.ts`) | `AGENTS.md` eval list (if maintaining one); `docs/ADDING_EVALS.md` if the change reveals a gap in the guide | | `setup_command` behaviour changed (e.g. 
new syntax supported) | `docs/ADDING_EVALS.md` — frontmatter table and example; `AGENTS.md` checklist if relevant | +| `compile_command` added to an eval or its context-injection behaviour changed | `docs/ADDING_EVALS.md` — frontmatter table and example; `AGENTS.md` checklist if relevant | | New skill added or skill resolution logic changed | `docs/TESTING_SKILLS.md`; `AGENTS.md` if skill tooling or config changed | | New CLI flag or runner added | `AGENTS.md` CLI flags table and Agent runners table; `README.md` quick-start if the flag is commonly used | | Scoring dimension added, changed, or removed | `docs/SCORING_METHODOLOGY.md` first (per the workflow); then `AGENTS.md` scoring section once merged | @@ -151,7 +152,7 @@ When you make a change, update every doc whose described behavior is affected. T ## Adding an eval — checklist 1. `src/evals/<category>/<name>/PROMPT.md` + `graders.ts` -2. Add `id` (required) and optionally `name`/`category` to `PROMPT.md` frontmatter — the framework auto-discovers evals from `evalsDir` +2. Add `id` (required) and optionally `name`/`category`/`compile_command` to `PROMPT.md` frontmatter — the framework auto-discovers evals from `evalsDir`. Set `compile_command` to point the agent at a verify-compiles command (injected into the agent's context file, e.g. `CLAUDE.md`); omit for evals with no CLI compile step (e.g. mobile). 3. All imports use `.js` extensions; `import type` for type-only 4. All graders have `GraderLevel`; one final holistic `judge` with no level 5. 
`npm run build && npm test` passes diff --git a/apps/auth0-evals/src/evals/quickstarts/angular/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/angular/PROMPT.md index ed416e0b..671b0745 100644 --- a/apps/auth0-evals/src/evals/quickstarts/angular/PROMPT.md +++ b/apps/auth0-evals/src/evals/quickstarts/angular/PROMPT.md @@ -3,6 +3,7 @@ id: angular_quickstart name: Angular Quickstart skills: auth0-angular setup_command: npm install +compile_command: npm run build --- ## Task diff --git a/apps/auth0-evals/src/evals/quickstarts/express/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/express/PROMPT.md index 47b3d612..27b66e30 100644 --- a/apps/auth0-evals/src/evals/quickstarts/express/PROMPT.md +++ b/apps/auth0-evals/src/evals/quickstarts/express/PROMPT.md @@ -3,6 +3,7 @@ id: express_quickstart name: Express Quickstart skills: auth0-express setup_command: npm install +compile_command: node --check server.js --- ## Task diff --git a/apps/auth0-evals/src/evals/quickstarts/fastapi/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/fastapi/PROMPT.md index a1f475b5..0f3fa1a0 100644 --- a/apps/auth0-evals/src/evals/quickstarts/fastapi/PROMPT.md +++ b/apps/auth0-evals/src/evals/quickstarts/fastapi/PROMPT.md @@ -3,6 +3,7 @@ id: fastapi_quickstart name: FastAPI Quickstart skills: auth0-fastapi-api setup_command: python3 -m venv .venv && .venv/bin/pip install -r requirements.txt +compile_command: .venv/bin/python -m py_compile main.py --- ## Task diff --git a/apps/auth0-evals/src/evals/quickstarts/fastify-api/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/fastify-api/PROMPT.md index ce6e8ceb..531afdfb 100644 --- a/apps/auth0-evals/src/evals/quickstarts/fastify-api/PROMPT.md +++ b/apps/auth0-evals/src/evals/quickstarts/fastify-api/PROMPT.md @@ -3,6 +3,7 @@ id: fastify_api_quickstart name: Fastify API Quickstart skills: auth0-fastify-api setup_command: npm install +compile_command: node --check server.js --- ## Task diff --git 
a/apps/auth0-evals/src/evals/quickstarts/flask/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/flask/PROMPT.md index 19bd9bea..51710e4f 100644 --- a/apps/auth0-evals/src/evals/quickstarts/flask/PROMPT.md +++ b/apps/auth0-evals/src/evals/quickstarts/flask/PROMPT.md @@ -3,6 +3,7 @@ id: flask_quickstart name: Flask Quickstart skills: auth0-flask setup_command: python3 -m venv .venv && .venv/bin/pip install -r requirements.txt +compile_command: .venv/bin/python -m py_compile app.py --- ## Task diff --git a/apps/auth0-evals/src/evals/quickstarts/nextjs/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/nextjs/PROMPT.md index 14629d1c..049ddc6c 100644 --- a/apps/auth0-evals/src/evals/quickstarts/nextjs/PROMPT.md +++ b/apps/auth0-evals/src/evals/quickstarts/nextjs/PROMPT.md @@ -3,6 +3,7 @@ id: nextjs_quickstart name: Next.js App Router Quickstart skills: auth0-nextjs setup_command: npm install +compile_command: npm run build --- ## Task diff --git a/apps/auth0-evals/src/evals/quickstarts/nuxt/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/nuxt/PROMPT.md index 0f7a6b19..2dd5a460 100644 --- a/apps/auth0-evals/src/evals/quickstarts/nuxt/PROMPT.md +++ b/apps/auth0-evals/src/evals/quickstarts/nuxt/PROMPT.md @@ -3,6 +3,7 @@ id: nuxt_quickstart name: Nuxt Quickstart skills: auth0-nuxt setup_command: npm install +compile_command: npm run build --- ## Task diff --git a/apps/auth0-evals/src/evals/quickstarts/react/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/react/PROMPT.md index 35f5f915..926e56e5 100644 --- a/apps/auth0-evals/src/evals/quickstarts/react/PROMPT.md +++ b/apps/auth0-evals/src/evals/quickstarts/react/PROMPT.md @@ -4,6 +4,7 @@ name: React Quickstart scaffold: src/evals/scaffolds/react/basic skills: auth0-react setup_command: npm install +compile_command: npm run build --- ## Task diff --git a/apps/auth0-evals/src/evals/quickstarts/spa-js/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/spa-js/PROMPT.md index 5712b20f..a26f79c4 100644 --- 
a/apps/auth0-evals/src/evals/quickstarts/spa-js/PROMPT.md +++ b/apps/auth0-evals/src/evals/quickstarts/spa-js/PROMPT.md @@ -3,6 +3,7 @@ id: spa_js_quickstart name: SPA JS Quickstart skills: auth0-spa-js setup_command: npm install +compile_command: npm run build --- ## Task diff --git a/apps/auth0-evals/src/evals/quickstarts/vue/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/vue/PROMPT.md index 812b87b2..28b0eb0d 100644 --- a/apps/auth0-evals/src/evals/quickstarts/vue/PROMPT.md +++ b/apps/auth0-evals/src/evals/quickstarts/vue/PROMPT.md @@ -3,6 +3,7 @@ id: vue_quickstart name: Vue Quickstart skills: auth0-vue setup_command: npm install +compile_command: npm run build --- ## Task diff --git a/docs/ADDING_EVALS.md b/docs/ADDING_EVALS.md index fa045bd7..f1b89356 100644 --- a/docs/ADDING_EVALS.md +++ b/docs/ADDING_EVALS.md @@ -31,6 +31,7 @@ id: my_new_eval name: My New Eval skills: auth0-react setup_command: npm install +compile_command: npm run build --- ``` @@ -41,6 +42,7 @@ setup_command: npm install | `category` | no | Defaults to the parent directory name (e.g. `quickstarts`) | | `skills` | no | Comma-separated skill names from [auth0/agent-skills](https://github.com/auth0/agent-skills). Injected into agent context when running with `--tools skills` | | `setup_command` | no | Command run before the agent starts (e.g. `npm install`). Split on whitespace and executed directly via `spawnSync` — no shell, no operators (`&&`, `\|`, etc.), no quoting. One command only. | +| `compile_command` | no | Compile/build command (e.g. `npm run build`, `node --check server.js`, `.venv/bin/python -m py_compile main.py`). When set, an instruction pointing the agent at this command is appended to the agent's native context file (`CLAUDE.md` / `GEMINI.md` / `AGENTS.md` / `.github/copilot-instructions.md`) alongside the "no docs files" guidance, so the agent verifies the project compiles and the command shows up in the tool trace. Agent modes only — baseline ignores it. 
Omit for evals with no CLI compile step (e.g. mobile). | To test a skill before it is pushed to the remote repo, see [TESTING_SKILLS.md](TESTING_SKILLS.md). @@ -61,6 +63,7 @@ id: react_quickstart name: React Quickstart skills: auth0-react setup_command: npm install +compile_command: npm run build --- ## Task diff --git a/packages/eval-core/src/index.ts b/packages/eval-core/src/index.ts index e6147132..dcb39c17 100644 --- a/packages/eval-core/src/index.ts +++ b/packages/eval-core/src/index.ts @@ -82,6 +82,7 @@ export { writeAgentGuidance, AGENT_GUIDANCE, AGENT_CONTEXT_FILENAMES, + compileGuidance, collectFiles, readWorkspaceFile, isPathInside, diff --git a/packages/eval-core/src/loader.ts b/packages/eval-core/src/loader.ts index 4979b65f..51f33416 100644 --- a/packages/eval-core/src/loader.ts +++ b/packages/eval-core/src/loader.ts @@ -63,6 +63,7 @@ export async function loadEval( .filter(Boolean); const setupCommand = meta.setup_command || undefined; + const compileCommand = meta.compile_command || undefined; return { id: evalConfig.id, @@ -74,6 +75,7 @@ export async function loadEval( graders, scaffold, setupCommand, + compileCommand, skills, metadata: { provider_name: meta.provider_name ?? 
'Auth0', diff --git a/packages/eval-core/src/types/eval.ts b/packages/eval-core/src/types/eval.ts index ab11647e..1f518ad0 100644 --- a/packages/eval-core/src/types/eval.ts +++ b/packages/eval-core/src/types/eval.ts @@ -19,6 +19,7 @@ export interface EvalDefinition { graders: GraderDef[]; scaffold: Record; setupCommand?: string; + compileCommand?: string; skills: string[]; metadata: Record; } diff --git a/packages/eval-core/src/workspace/index.ts b/packages/eval-core/src/workspace/index.ts index dc4e26af..5a69b748 100644 --- a/packages/eval-core/src/workspace/index.ts +++ b/packages/eval-core/src/workspace/index.ts @@ -5,6 +5,7 @@ export { writeAgentGuidance, AGENT_GUIDANCE, AGENT_CONTEXT_FILENAMES, + compileGuidance, } from './workspace.js'; export type { SetupWorkspaceOptions, RunSetupCommandOptions } from './workspace.js'; export { collectFiles, readWorkspaceFile } from './file-utils.js'; diff --git a/packages/eval-core/src/workspace/workspace.ts b/packages/eval-core/src/workspace/workspace.ts index c3890d6e..c19d6b1b 100644 --- a/packages/eval-core/src/workspace/workspace.ts +++ b/packages/eval-core/src/workspace/workspace.ts @@ -32,6 +32,15 @@ import { resolveInside } from './path-utils.js'; export const AGENT_GUIDANCE = `Do not create any documentation files (README.md, SETUP.md, QUICKSTART.md, IMPLEMENTATION_SUMMARY.md, or any other .md files). Do not create any .txt summary or verification files. Do not create standalone summary or status files of any kind (e.g. AUTH0_SETUP.ts, IMPLEMENTATION_COMPLETE.ts, QUICK_START.ts, FILES_CREATED.txt) — these are not application source code. Only create and modify source code files that are part of the application. `; +/** + * Builds the compile-verification guidance appended to the agent's context file + * when the eval declares a `compileCommand`. Pointing the agent at the command + * means it appears in the tool trace and the agent can fix any failures. 
+ */ +export function compileGuidance(compileCommand: string): string { + return `To verify your integration compiles, you can use this command:\n\n\`${compileCommand}\`\n`; +} + /** * The context/memory file each runner reads, relative to the workspace root. * Writing guidance to the wrong file means the agent silently ignores it: @@ -51,12 +60,15 @@ export const AGENT_CONTEXT_FILENAMES: Record<AgentType, string> = { /** * Writes {@link AGENT_GUIDANCE} into the context file the given runner reads. * Appends (preserving any scaffold-provided content) when the file already - * exists; creates it otherwise. + * exists; creates it otherwise. When `compileCommand` is provided, the + * compile-verification guidance (see {@link compileGuidance}) is appended too. */ -export function writeAgentGuidance(workspace: string, agentType: AgentType): void { +export function writeAgentGuidance(workspace: string, agentType: AgentType, compileCommand?: string): void { const filename = AGENT_CONTEXT_FILENAMES[agentType]; const dest = join(workspace, filename); + const guidance = compileCommand ? `${AGENT_GUIDANCE}\n${compileGuidance(compileCommand)}` : AGENT_GUIDANCE; + // If the scaffold shipped AGENTS.md but the active runner reads a different // file, rename it so the guidance reaches the right runner. 
const scaffoldAgentsMd = join(workspace, 'AGENTS.md'); @@ -66,10 +78,10 @@ export function writeAgentGuidance(workspace: string, agentType: AgentType): voi } if (existsSync(dest)) { - appendFileSync(dest, `\n${AGENT_GUIDANCE}`, 'utf-8'); + appendFileSync(dest, `\n${guidance}`, 'utf-8'); } else { mkdirSync(join(dest, '..'), { recursive: true }); - writeFileSync(dest, AGENT_GUIDANCE, 'utf-8'); + writeFileSync(dest, guidance, 'utf-8'); } } diff --git a/packages/eval-core/tests/loader.test.ts b/packages/eval-core/tests/loader.test.ts index fb9fb916..dbee9f40 100644 --- a/packages/eval-core/tests/loader.test.ts +++ b/packages/eval-core/tests/loader.test.ts @@ -151,6 +151,26 @@ describe('loadEval - setupCommand', () => { }); }); +// ── compile_command frontmatter tests ──────────────────────────────────────── + +describe('loadEval - compileCommand', () => { + it('parses compile_command from frontmatter', async () => { + makeEvalDir(tmpBase, '---\nskills: auth0-react\ncompile_command: npm run build\n---\n\n## Task\nDo the task.\n'); + + const result = await loadEval(EVAL_CONFIG, tmpBase); + + expect(result.compileCommand).toBe('npm run build'); + }); + + it('returns undefined when compile_command is absent', async () => { + makeEvalDir(tmpBase, '---\nskills: auth0-react\n---\n\n## Task\nDo the task.\n'); + + const result = await loadEval(EVAL_CONFIG, tmpBase); + + expect(result.compileCommand).toBeUndefined(); + }); +}); + // ── System prompt tests ─────────────────────────────────────────────────────── describe('loadEval - system prompt', () => { @@ -239,10 +259,8 @@ describe('loadEval - scaffold loading', () => { expect(result.scaffold['readable.txt']).toBe('this is fine'); expect(result.scaffold['unreadable.txt']).toBeUndefined(); }); - }); - // ── frontmatter scaffold field tests ───────────────────────────────────────── function makePromptWithScaffold(scaffoldPath: string): string { diff --git a/packages/eval-core/tests/workspace.test.ts 
b/packages/eval-core/tests/workspace.test.ts index 9120771e..ec2db233 100644 --- a/packages/eval-core/tests/workspace.test.ts +++ b/packages/eval-core/tests/workspace.test.ts @@ -8,6 +8,7 @@ import { writeAgentGuidance, AGENT_GUIDANCE, AGENT_CONTEXT_FILENAMES, + compileGuidance, } from '../src/workspace/workspace.js'; import { resolveInside } from '../src/workspace/path-utils.js'; @@ -139,6 +140,38 @@ describe('writeAgentGuidance - runner-aware context file', () => { cleanupWorkspace(workspace); }); + + it('appends compile guidance containing the command when compileCommand is provided', () => { + const workspace = setupWorkspace({ 'index.js': 'ok' }); + writeAgentGuidance(workspace, 'claude-code', 'npm run build'); + + const content = readFileSync(join(workspace, 'CLAUDE.md'), 'utf-8'); + expect(content).toContain(AGENT_GUIDANCE); + expect(content).toContain('npm run build'); + expect(content).toContain('verify your integration compiles'); + + cleanupWorkspace(workspace); + }); + + it('omits compile guidance when compileCommand is not provided', () => { + const workspace = setupWorkspace({ 'index.js': 'ok' }); + writeAgentGuidance(workspace, 'claude-code'); + + const content = readFileSync(join(workspace, 'CLAUDE.md'), 'utf-8'); + expect(content).toBe(AGENT_GUIDANCE); + expect(content).not.toContain('verify your integration compiles'); + + cleanupWorkspace(workspace); + }); +}); + +describe('compileGuidance', () => { + it('embeds the command and the verification instruction', () => { + const result = compileGuidance('npm run build'); + + expect(result).toContain('npm run build'); + expect(result).toContain('verify your integration compiles'); + }); }); describe('resolveInside - symlink escape protection', () => { diff --git a/packages/eval/src/cli/run.ts b/packages/eval/src/cli/run.ts index 19da6f06..a2cd082b 100644 --- a/packages/eval/src/cli/run.ts +++ b/packages/eval/src/cli/run.ts @@ -134,7 +134,7 @@ async function runAgentJob( // Inject "no docs files" 
guidance into the context file this runner reads // (CLAUDE.md / GEMINI.md / AGENTS.md). Must run before both the docker and // local execution paths so every runner picks it up. - writeAgentGuidance(workspace, agentType); + writeAgentGuidance(workspace, agentType, evalDef.compileCommand); try { if (!sandbox && evalDef.setupCommand) { runSetupCommand(workspace, evalDef.setupCommand); From 0c521e78c1df5df3ad3f9d90e31aeb9ca2612188 Mon Sep 17 00:00:00 2001 From: Frederik Prijck Date: Tue, 9 Jun 2026 15:43:55 +0200 Subject: [PATCH 2/3] test: enable build-verification graders for frontend quickstarts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Uncomment the ranCommand/ranCommandOneOf build graders in the six frontend quickstarts (react, vue, spa-js, angular, nuxt, nextjs) now that compile_command instructs the agent to run the build. Install graders stay disabled — a valid solution may edit package.json then run a bare `npm install`, which the install grader would not match. 
--- .../src/evals/quickstarts/angular/graders.ts | 14 +++++++------- .../src/evals/quickstarts/nextjs/graders.ts | 15 ++++++++++++--- .../src/evals/quickstarts/nuxt/graders.ts | 15 ++++++++++++--- .../src/evals/quickstarts/react/graders.ts | 6 +++--- .../src/evals/quickstarts/spa-js/graders.ts | 6 +++--- .../src/evals/quickstarts/vue/graders.ts | 6 +++--- 6 files changed, 40 insertions(+), 22 deletions(-) diff --git a/apps/auth0-evals/src/evals/quickstarts/angular/graders.ts b/apps/auth0-evals/src/evals/quickstarts/angular/graders.ts index 8f057e25..3cf3c02a 100644 --- a/apps/auth0-evals/src/evals/quickstarts/angular/graders.ts +++ b/apps/auth0-evals/src/evals/quickstarts/angular/graders.ts @@ -1,4 +1,4 @@ -import { contains, notContains, matches, judge, GraderLevel } from '@a0/eval-graders'; +import { contains, notContains, matches, judge, ranCommandOneOf, GraderLevel } from '@a0/eval-graders'; export function defineGraders() { return [ @@ -20,13 +20,13 @@ export function defineGraders() { notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3), // ── L4: Structural / behavioral correctness ─────────────────────────────── - // Event-based install/build verification temporarily disabled — see PR scoping discussion. + // Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`. 
// ranCommand('npm install', '@auth0/auth0-angular', 'Ran npm install for @auth0/auth0-angular', GraderLevel.L4), - // ranCommandOneOf( - // ['npm run build', 'ng build'], - // 'Ran build to verify compilation (npm run build, ng build, or npx ng build)', - // GraderLevel.L4, - // ), + ranCommandOneOf( + ['npm run build', 'ng build'], + 'Ran build to verify compilation (npm run build, ng build, or npx ng build)', + GraderLevel.L4, + ), matches(String.raw`provideAuth0\s*\(`, 'Auth0 configured via provideAuth0()', GraderLevel.L4), matches( String.raw`canActivate\s*:\s*\[?\s*(AuthGuard|authGuardFn)`, diff --git a/apps/auth0-evals/src/evals/quickstarts/nextjs/graders.ts b/apps/auth0-evals/src/evals/quickstarts/nextjs/graders.ts index 842a116a..5894aa0e 100644 --- a/apps/auth0-evals/src/evals/quickstarts/nextjs/graders.ts +++ b/apps/auth0-evals/src/evals/quickstarts/nextjs/graders.ts @@ -1,4 +1,13 @@ -import { contains, notContains, notContainsInSource, matches, judge, wroteFile, GraderLevel } from '@a0/eval-graders'; +import { + contains, + notContains, + notContainsInSource, + matches, + judge, + wroteFile, + ranCommandOneOf, + GraderLevel, +} from '@a0/eval-graders'; export function defineGraders() { return [ @@ -35,9 +44,9 @@ export function defineGraders() { ), // ── L4: Structural / behavioral correctness ─────────────────────────────── - // Event-based install/build verification temporarily disabled — see PR scoping discussion. + // Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`. 
// ranCommand('npm install', '@auth0/nextjs-auth0', 'Ran npm install for @auth0/nextjs-auth0', GraderLevel.L4), - // ranCommandOneOf(['npm run build', 'next build'], 'Ran build to verify compilation', GraderLevel.L4), + ranCommandOneOf(['npm run build', 'next build'], 'Ran build to verify compilation', GraderLevel.L4), wroteFile('.env', 'Wrote Auth0 credentials to .env file', GraderLevel.L4, [ 'dev-barkbook.us.auth0.com', 'barkbook_client_abc123xyz', diff --git a/apps/auth0-evals/src/evals/quickstarts/nuxt/graders.ts b/apps/auth0-evals/src/evals/quickstarts/nuxt/graders.ts index 95cff92d..978d0b8c 100644 --- a/apps/auth0-evals/src/evals/quickstarts/nuxt/graders.ts +++ b/apps/auth0-evals/src/evals/quickstarts/nuxt/graders.ts @@ -1,4 +1,13 @@ -import { contains, notContains, notContainsInSource, matches, judge, wroteFile, GraderLevel } from '@a0/eval-graders'; +import { + contains, + notContains, + notContainsInSource, + matches, + judge, + wroteFile, + ranCommandOneOf, + GraderLevel, +} from '@a0/eval-graders'; export function defineGraders() { return [ @@ -44,9 +53,9 @@ export function defineGraders() { notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3), // ── L4: Structural / behavioral correctness ─────────────────────────────── - // Event-based install/build verification temporarily disabled — see PR scoping discussion. + // Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`. 
// ranCommand('npm install', '@auth0/auth0-nuxt', 'Ran npm install for @auth0/auth0-nuxt', GraderLevel.L4), - // ranCommandOneOf(['npm run build', 'nuxt build'], 'Ran build to verify compilation', GraderLevel.L4), + ranCommandOneOf(['npm run build', 'nuxt build'], 'Ran build to verify compilation', GraderLevel.L4), wroteFile('.env', 'Wrote Auth0 credentials to .env file', GraderLevel.L4, [ 'dev-playground.us.auth0.com', 'playground_client_abc123xyz', diff --git a/apps/auth0-evals/src/evals/quickstarts/react/graders.ts b/apps/auth0-evals/src/evals/quickstarts/react/graders.ts index f9981a16..f9c27eb4 100644 --- a/apps/auth0-evals/src/evals/quickstarts/react/graders.ts +++ b/apps/auth0-evals/src/evals/quickstarts/react/graders.ts @@ -1,4 +1,4 @@ -import { contains, notContains, matches, judge, GraderLevel } from '@a0/eval-graders'; +import { contains, notContains, matches, judge, ranCommand, GraderLevel } from '@a0/eval-graders'; export function defineGraders() { return [ @@ -21,9 +21,9 @@ export function defineGraders() { notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3), // ── L4: Structural / behavioral correctness ─────────────────────────────── - // Event-based install/build verification temporarily disabled — see PR scoping discussion. + // Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`. 
// ranCommand('npm install', '@auth0/auth0-react', 'Ran npm install for @auth0/auth0-react', GraderLevel.L4), - // ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4), + ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4), matches(String.raw` Date: Wed, 10 Jun 2026 14:55:03 +0200 Subject: [PATCH 3/3] fix: make compile_command guidance imperative so agents run it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The injected compile-verification guidance used permissive wording ("you can use this command"), so capable models produced correct code but skipped the build — failing the mandatory build-verification grader. Rephrase as a "you MUST run" instruction and assert the mandatory wording in tests. --- packages/eval-core/src/workspace/workspace.ts | 2 +- packages/eval-core/tests/workspace.test.ts | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/packages/eval-core/src/workspace/workspace.ts b/packages/eval-core/src/workspace/workspace.ts index c19d6b1b..edf03895 100644 --- a/packages/eval-core/src/workspace/workspace.ts +++ b/packages/eval-core/src/workspace/workspace.ts @@ -38,7 +38,7 @@ export const AGENT_GUIDANCE = `Do not create any documentation files (README.md, * means it appears in the tool trace and the agent can fix any failures. 
*/ export function compileGuidance(compileCommand: string): string { - return `To verify your integration compiles, you can use this command:\n\n\`${compileCommand}\`\n`; + return `After making your changes, you MUST run this command to verify your integration compiles, and fix any errors it reports before finishing:\n\n\`${compileCommand}\`\n`; } /** diff --git a/packages/eval-core/tests/workspace.test.ts b/packages/eval-core/tests/workspace.test.ts index ec2db233..ee8d4cfd 100644 --- a/packages/eval-core/tests/workspace.test.ts +++ b/packages/eval-core/tests/workspace.test.ts @@ -172,6 +172,13 @@ describe('compileGuidance', () => { expect(result).toContain('npm run build'); expect(result).toContain('verify your integration compiles'); }); + + it('phrases the instruction as mandatory, not optional', () => { + const result = compileGuidance('npm run build'); + + expect(result).toContain('MUST'); + expect(result).not.toContain('you can use'); + }); }); describe('resolveInside - symlink escape protection', () => {