From 70eeb8cbbc0ba155ff1a0d3fa891f1115b6326d1 Mon Sep 17 00:00:00 2001
From: Frederik Prijck <frederik.prijck@okta.com>
Date: Tue, 9 Jun 2026 15:04:41 +0200
Subject: [PATCH 1/3] feat: inject compile_command guidance into agent context
 files

Add an optional `compile_command` PROMPT.md frontmatter field. When set,
a verify-compiles instruction is appended to the agent's native context
file (CLAUDE.md / GEMINI.md / AGENTS.md /
.github/copilot-instructions.md) alongside the existing "no docs files"
guidance, so the agent verifies the project compiles and the command
appears in the tool trace. Baseline runs ignore the field.

Wires the field into all 10 quickstart evals.
---
 AGENTS.md                                     |  3 +-
 .../src/evals/quickstarts/angular/PROMPT.md   |  1 +
 .../src/evals/quickstarts/express/PROMPT.md   |  1 +
 .../src/evals/quickstarts/fastapi/PROMPT.md   |  1 +
 .../evals/quickstarts/fastify-api/PROMPT.md   |  1 +
 .../src/evals/quickstarts/flask/PROMPT.md     |  1 +
 .../src/evals/quickstarts/nextjs/PROMPT.md    |  1 +
 .../src/evals/quickstarts/nuxt/PROMPT.md      |  1 +
 .../src/evals/quickstarts/react/PROMPT.md     |  1 +
 .../src/evals/quickstarts/spa-js/PROMPT.md    |  1 +
 .../src/evals/quickstarts/vue/PROMPT.md       |  1 +
 docs/ADDING_EVALS.md                          |  3 ++
 packages/eval-core/src/index.ts               |  1 +
 packages/eval-core/src/loader.ts              |  2 ++
 packages/eval-core/src/types/eval.ts          |  1 +
 packages/eval-core/src/workspace/index.ts     |  1 +
 packages/eval-core/src/workspace/workspace.ts | 20 ++++++++---
 packages/eval-core/tests/loader.test.ts       | 22 +++++++++++--
 packages/eval-core/tests/workspace.test.ts    | 33 +++++++++++++++++++
 packages/eval/src/cli/run.ts                  |  2 +-
 20 files changed, 90 insertions(+), 8 deletions(-)
diff --git a/AGENTS.md b/AGENTS.md
index d5845885..a428d7ab 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -137,6 +137,7 @@ When you make a change, update every doc whose described behavior is affected. T
 |---|---|
 | New eval added (`PROMPT.md` + `graders.ts`) | `AGENTS.md` eval list (if maintaining one); `docs/ADDING_EVALS.md` if the change reveals a gap in the guide |
 | `setup_command` behaviour changed (e.g. new syntax supported) | `docs/ADDING_EVALS.md` — frontmatter table and example; `AGENTS.md` checklist if relevant |
+| `compile_command` added to an eval or its context-injection behaviour changed | `docs/ADDING_EVALS.md` — frontmatter table and example; `AGENTS.md` checklist if relevant |
 | New skill added or skill resolution logic changed | `docs/TESTING_SKILLS.md`; `AGENTS.md` if skill tooling or config changed |
 | New CLI flag or runner added | `AGENTS.md` CLI flags table and Agent runners table; `README.md` quick-start if the flag is commonly used |
 | Scoring dimension added, changed, or removed | `docs/SCORING_METHODOLOGY.md` first (per the workflow); then `AGENTS.md` scoring section once merged |
@@ -151,7 +152,7 @@ When you make a change, update every doc whose described behavior is affected. T
 ## Adding an eval — checklist
 
 1. `src/evals/<category>/<eval-dir>/PROMPT.md` + `graders.ts`
-2. Add `id` (required) and optionally `name`/`category` to `PROMPT.md` frontmatter — the framework auto-discovers evals from `evalsDir`
+2. Add `id` (required) and optionally `name`/`category`/`compile_command` to `PROMPT.md` frontmatter — the framework auto-discovers evals from `evalsDir`. Set `compile_command` to point the agent at a verify-compiles command (injected into the agent's context file, e.g. `CLAUDE.md`); omit for evals with no CLI compile step (e.g. mobile).
 3. All imports use `.js` extensions; `import type` for type-only
 4. All graders have `GraderLevel`; one final holistic `judge` with no level
 5. `npm run build && npm test` passes
diff --git a/apps/auth0-evals/src/evals/quickstarts/angular/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/angular/PROMPT.md
index ed416e0b..671b0745 100644
--- a/apps/auth0-evals/src/evals/quickstarts/angular/PROMPT.md
+++ b/apps/auth0-evals/src/evals/quickstarts/angular/PROMPT.md
@@ -3,6 +3,7 @@ id: angular_quickstart
 name: Angular Quickstart
 skills: auth0-angular
 setup_command: npm install
+compile_command: npm run build
 ---
 
 ## Task
diff --git a/apps/auth0-evals/src/evals/quickstarts/express/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/express/PROMPT.md
index 47b3d612..27b66e30 100644
--- a/apps/auth0-evals/src/evals/quickstarts/express/PROMPT.md
+++ b/apps/auth0-evals/src/evals/quickstarts/express/PROMPT.md
@@ -3,6 +3,7 @@ id: express_quickstart
 name: Express Quickstart
 skills: auth0-express
 setup_command: npm install
+compile_command: node --check server.js
 ---
 
 ## Task
diff --git a/apps/auth0-evals/src/evals/quickstarts/fastapi/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/fastapi/PROMPT.md
index a1f475b5..0f3fa1a0 100644
--- a/apps/auth0-evals/src/evals/quickstarts/fastapi/PROMPT.md
+++ b/apps/auth0-evals/src/evals/quickstarts/fastapi/PROMPT.md
@@ -3,6 +3,7 @@ id: fastapi_quickstart
 name: FastAPI Quickstart
 skills: auth0-fastapi-api
 setup_command: python3 -m venv .venv && .venv/bin/pip install -r requirements.txt
+compile_command: .venv/bin/python -m py_compile main.py
 ---
 
 ## Task
diff --git a/apps/auth0-evals/src/evals/quickstarts/fastify-api/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/fastify-api/PROMPT.md
index ce6e8ceb..531afdfb 100644
--- a/apps/auth0-evals/src/evals/quickstarts/fastify-api/PROMPT.md
+++ b/apps/auth0-evals/src/evals/quickstarts/fastify-api/PROMPT.md
@@ -3,6 +3,7 @@ id: fastify_api_quickstart
 name: Fastify API Quickstart
 skills: auth0-fastify-api
 setup_command: npm install
+compile_command: node --check server.js
 ---
 
 ## Task
diff --git a/apps/auth0-evals/src/evals/quickstarts/flask/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/flask/PROMPT.md
index 19bd9bea..51710e4f 100644
--- a/apps/auth0-evals/src/evals/quickstarts/flask/PROMPT.md
+++ b/apps/auth0-evals/src/evals/quickstarts/flask/PROMPT.md
@@ -3,6 +3,7 @@ id: flask_quickstart
 name: Flask Quickstart
 skills: auth0-flask
 setup_command: python3 -m venv .venv && .venv/bin/pip install -r requirements.txt
+compile_command: .venv/bin/python -m py_compile app.py
 ---
 
 ## Task
diff --git a/apps/auth0-evals/src/evals/quickstarts/nextjs/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/nextjs/PROMPT.md
index 14629d1c..049ddc6c 100644
--- a/apps/auth0-evals/src/evals/quickstarts/nextjs/PROMPT.md
+++ b/apps/auth0-evals/src/evals/quickstarts/nextjs/PROMPT.md
@@ -3,6 +3,7 @@ id: nextjs_quickstart
 name: Next.js App Router Quickstart
 skills: auth0-nextjs
 setup_command: npm install
+compile_command: npm run build
 ---
 
 ## Task
diff --git a/apps/auth0-evals/src/evals/quickstarts/nuxt/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/nuxt/PROMPT.md
index 0f7a6b19..2dd5a460 100644
--- a/apps/auth0-evals/src/evals/quickstarts/nuxt/PROMPT.md
+++ b/apps/auth0-evals/src/evals/quickstarts/nuxt/PROMPT.md
@@ -3,6 +3,7 @@ id: nuxt_quickstart
 name: Nuxt Quickstart
 skills: auth0-nuxt
 setup_command: npm install
+compile_command: npm run build
 ---
 
 ## Task
diff --git a/apps/auth0-evals/src/evals/quickstarts/react/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/react/PROMPT.md
index 35f5f915..926e56e5 100644
--- a/apps/auth0-evals/src/evals/quickstarts/react/PROMPT.md
+++ b/apps/auth0-evals/src/evals/quickstarts/react/PROMPT.md
@@ -4,6 +4,7 @@ name: React Quickstart
 scaffold: src/evals/scaffolds/react/basic
 skills: auth0-react
 setup_command: npm install
+compile_command: npm run build
 ---
 
 ## Task
diff --git a/apps/auth0-evals/src/evals/quickstarts/spa-js/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/spa-js/PROMPT.md
index 5712b20f..a26f79c4 100644
--- a/apps/auth0-evals/src/evals/quickstarts/spa-js/PROMPT.md
+++ b/apps/auth0-evals/src/evals/quickstarts/spa-js/PROMPT.md
@@ -3,6 +3,7 @@ id: spa_js_quickstart
 name: SPA JS Quickstart
 skills: auth0-spa-js
 setup_command: npm install
+compile_command: npm run build
 ---
 
 ## Task
diff --git a/apps/auth0-evals/src/evals/quickstarts/vue/PROMPT.md b/apps/auth0-evals/src/evals/quickstarts/vue/PROMPT.md
index 812b87b2..28b0eb0d 100644
--- a/apps/auth0-evals/src/evals/quickstarts/vue/PROMPT.md
+++ b/apps/auth0-evals/src/evals/quickstarts/vue/PROMPT.md
@@ -3,6 +3,7 @@ id: vue_quickstart
 name: Vue Quickstart
 skills: auth0-vue
 setup_command: npm install
+compile_command: npm run build
 ---
 
 ## Task
diff --git a/docs/ADDING_EVALS.md b/docs/ADDING_EVALS.md
index fa045bd7..f1b89356 100644
--- a/docs/ADDING_EVALS.md
+++ b/docs/ADDING_EVALS.md
@@ -31,6 +31,7 @@ id: my_new_eval
 name: My New Eval
 skills: auth0-react
 setup_command: npm install
+compile_command: npm run build
 ---
 ```
 
@@ -41,6 +42,7 @@ setup_command: npm install
 | `category` | no | Defaults to the parent directory name (e.g. `quickstarts`) |
 | `skills` | no | Comma-separated skill names from [auth0/agent-skills](https://github.com/auth0/agent-skills). Injected into agent context when running with `--tools skills` |
 | `setup_command` | no | Command run before the agent starts (e.g. `npm install`). Split on whitespace and executed directly via `spawnSync` — no shell, no operators (`&&`, `\|`, etc.), no quoting. One command only. |
+| `compile_command` | no | Compile/build command (e.g. `npm run build`, `node --check server.js`, `.venv/bin/python -m py_compile main.py`). When set, an instruction pointing the agent at this command is appended to the agent's native context file (`CLAUDE.md` / `GEMINI.md` / `AGENTS.md` / `.github/copilot-instructions.md`) alongside the "no docs files" guidance, so the agent verifies the project compiles and the command shows up in the tool trace. Agent modes only — baseline ignores it. Omit for evals with no CLI compile step (e.g. mobile). |
 
 To test a skill before it is pushed to the remote repo, see [TESTING_SKILLS.md](TESTING_SKILLS.md).
 
@@ -61,6 +63,7 @@ id: react_quickstart
 name: React Quickstart
 skills: auth0-react
 setup_command: npm install
+compile_command: npm run build
 ---
 
 ## Task
diff --git a/packages/eval-core/src/index.ts b/packages/eval-core/src/index.ts
index e6147132..dcb39c17 100644
--- a/packages/eval-core/src/index.ts
+++ b/packages/eval-core/src/index.ts
@@ -82,6 +82,7 @@ export {
   writeAgentGuidance,
   AGENT_GUIDANCE,
   AGENT_CONTEXT_FILENAMES,
+  compileGuidance,
   collectFiles,
   readWorkspaceFile,
   isPathInside,
diff --git a/packages/eval-core/src/loader.ts b/packages/eval-core/src/loader.ts
index 4979b65f..51f33416 100644
--- a/packages/eval-core/src/loader.ts
+++ b/packages/eval-core/src/loader.ts
@@ -63,6 +63,7 @@ export async function loadEval(
     .filter(Boolean);
 
   const setupCommand = meta.setup_command || undefined;
+  const compileCommand = meta.compile_command || undefined;
 
   return {
     id: evalConfig.id,
@@ -74,6 +75,7 @@ export async function loadEval(
     graders,
     scaffold,
     setupCommand,
+    compileCommand,
     skills,
     metadata: {
       provider_name: meta.provider_name ?? 'Auth0',
diff --git a/packages/eval-core/src/types/eval.ts b/packages/eval-core/src/types/eval.ts
index ab11647e..1f518ad0 100644
--- a/packages/eval-core/src/types/eval.ts
+++ b/packages/eval-core/src/types/eval.ts
@@ -19,6 +19,7 @@ export interface EvalDefinition {
   graders: GraderDef[];
   scaffold: Record<string, string>;
   setupCommand?: string;
+  compileCommand?: string;
   skills: string[];
   metadata: Record<string, string>;
 }
diff --git a/packages/eval-core/src/workspace/index.ts b/packages/eval-core/src/workspace/index.ts
index dc4e26af..5a69b748 100644
--- a/packages/eval-core/src/workspace/index.ts
+++ b/packages/eval-core/src/workspace/index.ts
@@ -5,6 +5,7 @@ export {
   writeAgentGuidance,
   AGENT_GUIDANCE,
   AGENT_CONTEXT_FILENAMES,
+  compileGuidance,
 } from './workspace.js';
 export type { SetupWorkspaceOptions, RunSetupCommandOptions } from './workspace.js';
 export { collectFiles, readWorkspaceFile } from './file-utils.js';
diff --git a/packages/eval-core/src/workspace/workspace.ts b/packages/eval-core/src/workspace/workspace.ts
index c3890d6e..c19d6b1b 100644
--- a/packages/eval-core/src/workspace/workspace.ts
+++ b/packages/eval-core/src/workspace/workspace.ts
@@ -32,6 +32,15 @@ import { resolveInside } from './path-utils.js';
 export const AGENT_GUIDANCE = `Do not create any documentation files (README.md, SETUP.md, QUICKSTART.md, IMPLEMENTATION_SUMMARY.md, or any other .md files). Do not create any .txt summary or verification files. Do not create standalone summary or status files of any kind (e.g. AUTH0_SETUP.ts, IMPLEMENTATION_COMPLETE.ts, QUICK_START.ts, FILES_CREATED.txt) — these are not application source code. Only create and modify source code files that are part of the application.
 `;
 
+/**
+ * Builds the compile-verification guidance appended to the agent's context file
+ * when the eval declares a `compileCommand`. Pointing the agent at the command
+ * means it appears in the tool trace and the agent can fix any failures.
+ */
+export function compileGuidance(compileCommand: string): string {
+  return `To verify your integration compiles, you can use this command:\n\n\`${compileCommand}\`\n`;
+}
+
 /**
  * The context/memory file each runner reads, relative to the workspace root.
  * Writing guidance to the wrong file means the agent silently ignores it:
@@ -51,12 +60,15 @@ export const AGENT_CONTEXT_FILENAMES: Record<AgentType, string> = {
 /**
  * Writes {@link AGENT_GUIDANCE} into the context file the given runner reads.
  * Appends (preserving any scaffold-provided content) when the file already
- * exists; creates it otherwise.
+ * exists; creates it otherwise. When `compileCommand` is provided, the
+ * compile-verification guidance (see {@link compileGuidance}) is appended too.
  */
-export function writeAgentGuidance(workspace: string, agentType: AgentType): void {
+export function writeAgentGuidance(workspace: string, agentType: AgentType, compileCommand?: string): void {
   const filename = AGENT_CONTEXT_FILENAMES[agentType];
   const dest = join(workspace, filename);
 
+  const guidance = compileCommand ? `${AGENT_GUIDANCE}\n${compileGuidance(compileCommand)}` : AGENT_GUIDANCE;
+
   // If the scaffold shipped AGENTS.md but the active runner reads a different
   // file, rename it so the guidance reaches the right runner.
   const scaffoldAgentsMd = join(workspace, 'AGENTS.md');
@@ -66,10 +78,10 @@ export function writeAgentGuidance(workspace: string, agentType: AgentType): voi
   }
 
   if (existsSync(dest)) {
-    appendFileSync(dest, `\n${AGENT_GUIDANCE}`, 'utf-8');
+    appendFileSync(dest, `\n${guidance}`, 'utf-8');
   } else {
     mkdirSync(join(dest, '..'), { recursive: true });
-    writeFileSync(dest, AGENT_GUIDANCE, 'utf-8');
+    writeFileSync(dest, guidance, 'utf-8');
   }
 }
 
diff --git a/packages/eval-core/tests/loader.test.ts b/packages/eval-core/tests/loader.test.ts
index fb9fb916..dbee9f40 100644
--- a/packages/eval-core/tests/loader.test.ts
+++ b/packages/eval-core/tests/loader.test.ts
@@ -151,6 +151,26 @@ describe('loadEval - setupCommand', () => {
   });
 });
 
+// ── compile_command frontmatter tests ────────────────────────────────────────
+
+describe('loadEval - compileCommand', () => {
+  it('parses compile_command from frontmatter', async () => {
+    makeEvalDir(tmpBase, '---\nskills: auth0-react\ncompile_command: npm run build\n---\n\n## Task\nDo the task.\n');
+
+    const result = await loadEval(EVAL_CONFIG, tmpBase);
+
+    expect(result.compileCommand).toBe('npm run build');
+  });
+
+  it('returns undefined when compile_command is absent', async () => {
+    makeEvalDir(tmpBase, '---\nskills: auth0-react\n---\n\n## Task\nDo the task.\n');
+
+    const result = await loadEval(EVAL_CONFIG, tmpBase);
+
+    expect(result.compileCommand).toBeUndefined();
+  });
+});
+
 // ── System prompt tests ───────────────────────────────────────────────────────
 
 describe('loadEval - system prompt', () => {
@@ -239,10 +259,8 @@ describe('loadEval - scaffold loading', () => {
     expect(result.scaffold['readable.txt']).toBe('this is fine');
     expect(result.scaffold['unreadable.txt']).toBeUndefined();
   });
-
 });
 
-
 // ── frontmatter scaffold field tests ─────────────────────────────────────────
 
 function makePromptWithScaffold(scaffoldPath: string): string {
diff --git a/packages/eval-core/tests/workspace.test.ts b/packages/eval-core/tests/workspace.test.ts
index 9120771e..ec2db233 100644
--- a/packages/eval-core/tests/workspace.test.ts
+++ b/packages/eval-core/tests/workspace.test.ts
@@ -8,6 +8,7 @@ import {
   writeAgentGuidance,
   AGENT_GUIDANCE,
   AGENT_CONTEXT_FILENAMES,
+  compileGuidance,
 } from '../src/workspace/workspace.js';
 import { resolveInside } from '../src/workspace/path-utils.js';
 
@@ -139,6 +140,38 @@ describe('writeAgentGuidance - runner-aware context file', () => {
 
     cleanupWorkspace(workspace);
   });
+
+  it('appends compile guidance containing the command when compileCommand is provided', () => {
+    const workspace = setupWorkspace({ 'index.js': 'ok' });
+    writeAgentGuidance(workspace, 'claude-code', 'npm run build');
+
+    const content = readFileSync(join(workspace, 'CLAUDE.md'), 'utf-8');
+    expect(content).toContain(AGENT_GUIDANCE);
+    expect(content).toContain('npm run build');
+    expect(content).toContain('verify your integration compiles');
+
+    cleanupWorkspace(workspace);
+  });
+
+  it('omits compile guidance when compileCommand is not provided', () => {
+    const workspace = setupWorkspace({ 'index.js': 'ok' });
+    writeAgentGuidance(workspace, 'claude-code');
+
+    const content = readFileSync(join(workspace, 'CLAUDE.md'), 'utf-8');
+    expect(content).toBe(AGENT_GUIDANCE);
+    expect(content).not.toContain('verify your integration compiles');
+
+    cleanupWorkspace(workspace);
+  });
+});
+
+describe('compileGuidance', () => {
+  it('embeds the command and the verification instruction', () => {
+    const result = compileGuidance('npm run build');
+
+    expect(result).toContain('npm run build');
+    expect(result).toContain('verify your integration compiles');
+  });
 });
 
 describe('resolveInside - symlink escape protection', () => {
diff --git a/packages/eval/src/cli/run.ts b/packages/eval/src/cli/run.ts
index 19da6f06..a2cd082b 100644
--- a/packages/eval/src/cli/run.ts
+++ b/packages/eval/src/cli/run.ts
@@ -134,7 +134,7 @@ async function runAgentJob(
   // Inject "no docs files" guidance into the context file this runner reads
   // (CLAUDE.md / GEMINI.md / AGENTS.md). Must run before both the docker and
   // local execution paths so every runner picks it up.
-  writeAgentGuidance(workspace, agentType);
+  writeAgentGuidance(workspace, agentType, evalDef.compileCommand);
   try {
     if (!sandbox && evalDef.setupCommand) {
       runSetupCommand(workspace, evalDef.setupCommand);

From 0c521e78c1df5df3ad3f9d90e31aeb9ca2612188 Mon Sep 17 00:00:00 2001
From: Frederik Prijck <frederik.prijck@okta.com>
Date: Tue, 9 Jun 2026 15:43:55 +0200
Subject: [PATCH 2/3] test: enable build-verification graders for frontend
 quickstarts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Uncomment the ranCommand/ranCommandOneOf build graders in the six
frontend quickstarts (react, vue, spa-js, angular, nuxt, nextjs) now
that compile_command instructs the agent to run the build. Install
graders stay disabled — a valid solution may edit package.json then run
a bare `npm install`, which the install grader would not match.
---
 .../src/evals/quickstarts/angular/graders.ts      | 14 +++++++-------
 .../src/evals/quickstarts/nextjs/graders.ts       | 15 ++++++++++++---
 .../src/evals/quickstarts/nuxt/graders.ts         | 15 ++++++++++++---
 .../src/evals/quickstarts/react/graders.ts        |  6 +++---
 .../src/evals/quickstarts/spa-js/graders.ts       |  6 +++---
 .../src/evals/quickstarts/vue/graders.ts          |  6 +++---
 6 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/apps/auth0-evals/src/evals/quickstarts/angular/graders.ts b/apps/auth0-evals/src/evals/quickstarts/angular/graders.ts
index 8f057e25..3cf3c02a 100644
--- a/apps/auth0-evals/src/evals/quickstarts/angular/graders.ts
+++ b/apps/auth0-evals/src/evals/quickstarts/angular/graders.ts
@@ -1,4 +1,4 @@
-import { contains, notContains, matches, judge, GraderLevel } from '@a0/eval-graders';
+import { contains, notContains, matches, judge, ranCommandOneOf, GraderLevel } from '@a0/eval-graders';
 
 export function defineGraders() {
   return [
@@ -20,13 +20,13 @@ export function defineGraders() {
     notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3),
 
     // ── L4: Structural / behavioral correctness ───────────────────────────────
-    // Event-based install/build verification temporarily disabled — see PR scoping discussion.
+    // Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
     // ranCommand('npm install', '@auth0/auth0-angular', 'Ran npm install for @auth0/auth0-angular', GraderLevel.L4),
-    // ranCommandOneOf(
-    //   ['npm run build', 'ng build'],
-    //   'Ran build to verify compilation (npm run build, ng build, or npx ng build)',
-    //   GraderLevel.L4,
-    // ),
+    ranCommandOneOf(
+      ['npm run build', 'ng build'],
+      'Ran build to verify compilation (npm run build, ng build, or npx ng build)',
+      GraderLevel.L4,
+    ),
     matches(String.raw`provideAuth0\s*\(`, 'Auth0 configured via provideAuth0()', GraderLevel.L4),
     matches(
       String.raw`canActivate\s*:\s*\[?\s*(AuthGuard|authGuardFn)`,
diff --git a/apps/auth0-evals/src/evals/quickstarts/nextjs/graders.ts b/apps/auth0-evals/src/evals/quickstarts/nextjs/graders.ts
index 842a116a..5894aa0e 100644
--- a/apps/auth0-evals/src/evals/quickstarts/nextjs/graders.ts
+++ b/apps/auth0-evals/src/evals/quickstarts/nextjs/graders.ts
@@ -1,4 +1,13 @@
-import { contains, notContains, notContainsInSource, matches, judge, wroteFile, GraderLevel } from '@a0/eval-graders';
+import {
+  contains,
+  notContains,
+  notContainsInSource,
+  matches,
+  judge,
+  wroteFile,
+  ranCommandOneOf,
+  GraderLevel,
+} from '@a0/eval-graders';
 
 export function defineGraders() {
   return [
@@ -35,9 +44,9 @@ export function defineGraders() {
     ),
 
     // ── L4: Structural / behavioral correctness ───────────────────────────────
-    // Event-based install/build verification temporarily disabled — see PR scoping discussion.
+    // Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
     // ranCommand('npm install', '@auth0/nextjs-auth0', 'Ran npm install for @auth0/nextjs-auth0', GraderLevel.L4),
-    // ranCommandOneOf(['npm run build', 'next build'], 'Ran build to verify compilation', GraderLevel.L4),
+    ranCommandOneOf(['npm run build', 'next build'], 'Ran build to verify compilation', GraderLevel.L4),
     wroteFile('.env', 'Wrote Auth0 credentials to .env file', GraderLevel.L4, [
       'dev-barkbook.us.auth0.com',
       'barkbook_client_abc123xyz',
diff --git a/apps/auth0-evals/src/evals/quickstarts/nuxt/graders.ts b/apps/auth0-evals/src/evals/quickstarts/nuxt/graders.ts
index 95cff92d..978d0b8c 100644
--- a/apps/auth0-evals/src/evals/quickstarts/nuxt/graders.ts
+++ b/apps/auth0-evals/src/evals/quickstarts/nuxt/graders.ts
@@ -1,4 +1,13 @@
-import { contains, notContains, notContainsInSource, matches, judge, wroteFile, GraderLevel } from '@a0/eval-graders';
+import {
+  contains,
+  notContains,
+  notContainsInSource,
+  matches,
+  judge,
+  wroteFile,
+  ranCommandOneOf,
+  GraderLevel,
+} from '@a0/eval-graders';
 
 export function defineGraders() {
   return [
@@ -44,9 +53,9 @@ export function defineGraders() {
     notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3),
 
     // ── L4: Structural / behavioral correctness ───────────────────────────────
-    // Event-based install/build verification temporarily disabled — see PR scoping discussion.
+    // Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
     // ranCommand('npm install', '@auth0/auth0-nuxt', 'Ran npm install for @auth0/auth0-nuxt', GraderLevel.L4),
-    // ranCommandOneOf(['npm run build', 'nuxt build'], 'Ran build to verify compilation', GraderLevel.L4),
+    ranCommandOneOf(['npm run build', 'nuxt build'], 'Ran build to verify compilation', GraderLevel.L4),
     wroteFile('.env', 'Wrote Auth0 credentials to .env file', GraderLevel.L4, [
       'dev-playground.us.auth0.com',
       'playground_client_abc123xyz',
diff --git a/apps/auth0-evals/src/evals/quickstarts/react/graders.ts b/apps/auth0-evals/src/evals/quickstarts/react/graders.ts
index f9981a16..f9c27eb4 100644
--- a/apps/auth0-evals/src/evals/quickstarts/react/graders.ts
+++ b/apps/auth0-evals/src/evals/quickstarts/react/graders.ts
@@ -1,4 +1,4 @@
-import { contains, notContains, matches, judge, GraderLevel } from '@a0/eval-graders';
+import { contains, notContains, matches, judge, ranCommand, GraderLevel } from '@a0/eval-graders';
 
 export function defineGraders() {
   return [
@@ -21,9 +21,9 @@ export function defineGraders() {
     notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3),
 
     // ── L4: Structural / behavioral correctness ───────────────────────────────
-    // Event-based install/build verification temporarily disabled — see PR scoping discussion.
+    // Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
     // ranCommand('npm install', '@auth0/auth0-react', 'Ran npm install for @auth0/auth0-react', GraderLevel.L4),
-    // ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
+    ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
     matches(String.raw`<Auth0Provider[\s\S]*?domain`, 'Auth0Provider configured with domain prop', GraderLevel.L4),
     contains('getAccessTokenSilently', 'Uses getAccessTokenSilently to retrieve access token', GraderLevel.L4),
     judge(
diff --git a/apps/auth0-evals/src/evals/quickstarts/spa-js/graders.ts b/apps/auth0-evals/src/evals/quickstarts/spa-js/graders.ts
index 3081a0b5..2eeeeba1 100644
--- a/apps/auth0-evals/src/evals/quickstarts/spa-js/graders.ts
+++ b/apps/auth0-evals/src/evals/quickstarts/spa-js/graders.ts
@@ -1,4 +1,4 @@
-import { contains, notContains, matches, judge, GraderLevel } from '@a0/eval-graders';
+import { contains, notContains, matches, judge, ranCommand, GraderLevel } from '@a0/eval-graders';
 
 export function defineGraders() {
   return [
@@ -22,9 +22,9 @@ export function defineGraders() {
     notContains('sessionStorage.setItem', 'No tokens manually stored in sessionStorage', GraderLevel.L3),
 
     // ── L4: Structural / behavioral correctness ───────────────────────────────
-    // Event-based install/build verification temporarily disabled — see PR scoping discussion.
+    // Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
     // ranCommand('npm install', '@auth0/auth0-spa-js', 'Ran npm install for @auth0/auth0-spa-js', GraderLevel.L4),
-    // ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
+    ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
     matches(
       String.raw`createAuth0Client\s*\(\s*\{[\s\S]*?domain`,
       'Auth0Client configured with domain',
diff --git a/apps/auth0-evals/src/evals/quickstarts/vue/graders.ts b/apps/auth0-evals/src/evals/quickstarts/vue/graders.ts
index d092b9d5..9f13172e 100644
--- a/apps/auth0-evals/src/evals/quickstarts/vue/graders.ts
+++ b/apps/auth0-evals/src/evals/quickstarts/vue/graders.ts
@@ -1,4 +1,4 @@
-import { contains, notContains, matches, judge, GraderLevel } from '@a0/eval-graders';
+import { contains, notContains, matches, judge, ranCommand, GraderLevel } from '@a0/eval-graders';
 
 export function defineGraders() {
   return [
@@ -21,9 +21,9 @@ export function defineGraders() {
     notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3),
 
     // ── L4: Structural / behavioral correctness ───────────────────────────────
-    // Event-based install/build verification temporarily disabled — see PR scoping discussion.
+    // Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
     // ranCommand('npm install', '@auth0/auth0-vue', 'Ran npm install for @auth0/auth0-vue', GraderLevel.L4),
-    // ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
+    ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
     matches(String.raw`app\.use\s*\(\s*createAuth0`, 'Plugin installed via app.use(createAuth0(...))', GraderLevel.L4),
     contains('getAccessTokenSilently', 'Uses getAccessTokenSilently to retrieve access token', GraderLevel.L4),
     judge(

From 7d89312e973c746c5eb4aabb7d3b432abc13f104 Mon Sep 17 00:00:00 2001
From: Frederik Prijck <frederik.prijck@okta.com>
Date: Wed, 10 Jun 2026 14:55:03 +0200
Subject: [PATCH 3/3] fix: make compile_command guidance imperative so agents
 run it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The injected compile-verification guidance used permissive wording
("you can use this command"), so capable models produced correct code
but skipped the build — failing the mandatory build-verification grader.
Rephrase as a "you MUST run" instruction that also tells the agent to
fix any reported errors before finishing, and assert the mandatory
wording in tests.
---
 packages/eval-core/src/workspace/workspace.ts | 2 +-
 packages/eval-core/tests/workspace.test.ts    | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/packages/eval-core/src/workspace/workspace.ts b/packages/eval-core/src/workspace/workspace.ts
index c19d6b1b..edf03895 100644
--- a/packages/eval-core/src/workspace/workspace.ts
+++ b/packages/eval-core/src/workspace/workspace.ts
@@ -38,7 +38,7 @@ export const AGENT_GUIDANCE = `Do not create any documentation files (README.md,
  * means it appears in the tool trace and the agent can fix any failures.
  */
 export function compileGuidance(compileCommand: string): string {
-  return `To verify your integration compiles, you can use this command:\n\n\`${compileCommand}\`\n`;
+  return `After making your changes, you MUST run this command to verify your integration compiles, and fix any errors it reports before finishing:\n\n\`${compileCommand}\`\n`;
 }
 
 /**
diff --git a/packages/eval-core/tests/workspace.test.ts b/packages/eval-core/tests/workspace.test.ts
index ec2db233..ee8d4cfd 100644
--- a/packages/eval-core/tests/workspace.test.ts
+++ b/packages/eval-core/tests/workspace.test.ts
@@ -172,6 +172,13 @@ describe('compileGuidance', () => {
     expect(result).toContain('npm run build');
     expect(result).toContain('verify your integration compiles');
   });
+
+  it('phrases the instruction as mandatory, not optional', () => {
+    const result = compileGuidance('npm run build');
+
+    expect(result).toContain('MUST');
+    expect(result).not.toContain('you can use');
+  });
 });
 
 describe('resolveInside - symlink escape protection', () => {