Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ When you make a change, update every doc whose described behavior is affected. T
|---|---|
| New eval added (`PROMPT.md` + `graders.ts`) | `AGENTS.md` eval list (if maintaining one); `docs/ADDING_EVALS.md` if the change reveals a gap in the guide |
| `setup_command` behaviour changed (e.g. new syntax supported) | `docs/ADDING_EVALS.md` — frontmatter table and example; `AGENTS.md` checklist if relevant |
| `compile_command` added to an eval or its context-injection behaviour changed | `docs/ADDING_EVALS.md` — frontmatter table and example; `AGENTS.md` checklist if relevant |
| New skill added or skill resolution logic changed | `docs/TESTING_SKILLS.md`; `AGENTS.md` if skill tooling or config changed |
| New CLI flag or runner added | `AGENTS.md` CLI flags table and Agent runners table; `README.md` quick-start if the flag is commonly used |
| Scoring dimension added, changed, or removed | `docs/SCORING_METHODOLOGY.md` first (per the workflow); then `AGENTS.md` scoring section once merged |
Expand All @@ -151,7 +152,7 @@ When you make a change, update every doc whose described behavior is affected. T
## Adding an eval — checklist

1. `src/evals/<category>/<eval-dir>/PROMPT.md` + `graders.ts`
2. Add `id` (required) and optionally `name`/`category` to `PROMPT.md` frontmatter — the framework auto-discovers evals from `evalsDir`
2. Add `id` (required) and optionally `name`/`category`/`compile_command` to `PROMPT.md` frontmatter — the framework auto-discovers evals from `evalsDir`. Set `compile_command` to the command the agent should run to verify that the project compiles (an instruction pointing at it is injected into the agent's context file, e.g. `CLAUDE.md`); omit it for evals with no CLI compile step (e.g. mobile).
3. All imports use `.js` extensions; `import type` for type-only
4. All graders have `GraderLevel`; one final holistic `judge` with no level
5. `npm run build && npm test` passes
Expand Down
1 change: 1 addition & 0 deletions apps/auth0-evals/src/evals/quickstarts/angular/PROMPT.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ id: angular_quickstart
name: Angular Quickstart
skills: auth0-angular
setup_command: npm install
compile_command: npm run build
---

## Task
Expand Down
14 changes: 7 additions & 7 deletions apps/auth0-evals/src/evals/quickstarts/angular/graders.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { contains, notContains, matches, judge, GraderLevel } from '@a0/eval-graders';
import { contains, notContains, matches, judge, ranCommandOneOf, GraderLevel } from '@a0/eval-graders';

export function defineGraders() {
return [
Expand All @@ -20,13 +20,13 @@ export function defineGraders() {
notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3),

// ── L4: Structural / behavioral correctness ───────────────────────────────
// Event-based install/build verification temporarily disabled — see PR scoping discussion.
// Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
// ranCommand('npm install', '@auth0/auth0-angular', 'Ran npm install for @auth0/auth0-angular', GraderLevel.L4),
// ranCommandOneOf(
// ['npm run build', 'ng build'],
// 'Ran build to verify compilation (npm run build, ng build, or npx ng build)',
// GraderLevel.L4,
// ),
ranCommandOneOf(
['npm run build', 'ng build'],
'Ran build to verify compilation (npm run build, ng build, or npx ng build)',
GraderLevel.L4,
),
matches(String.raw`provideAuth0\s*\(`, 'Auth0 configured via provideAuth0()', GraderLevel.L4),
matches(
String.raw`canActivate\s*:\s*\[?\s*(AuthGuard|authGuardFn)`,
Expand Down
1 change: 1 addition & 0 deletions apps/auth0-evals/src/evals/quickstarts/express/PROMPT.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ id: express_quickstart
name: Express Quickstart
skills: auth0-express
setup_command: npm install
compile_command: node --check server.js
---

## Task
Expand Down
1 change: 1 addition & 0 deletions apps/auth0-evals/src/evals/quickstarts/fastapi/PROMPT.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ id: fastapi_quickstart
name: FastAPI Quickstart
skills: auth0-fastapi-api
setup_command: python3 -m venv .venv && .venv/bin/pip install -r requirements.txt
compile_command: .venv/bin/python -m py_compile main.py
---

## Task
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ id: fastify_api_quickstart
name: Fastify API Quickstart
skills: auth0-fastify-api
setup_command: npm install
compile_command: node --check server.js
---

## Task
Expand Down
1 change: 1 addition & 0 deletions apps/auth0-evals/src/evals/quickstarts/flask/PROMPT.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ id: flask_quickstart
name: Flask Quickstart
skills: auth0-flask
setup_command: python3 -m venv .venv && .venv/bin/pip install -r requirements.txt
compile_command: .venv/bin/python -m py_compile app.py
---

## Task
Expand Down
1 change: 1 addition & 0 deletions apps/auth0-evals/src/evals/quickstarts/nextjs/PROMPT.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ id: nextjs_quickstart
name: Next.js App Router Quickstart
skills: auth0-nextjs
setup_command: npm install
compile_command: npm run build
---

## Task
Expand Down
15 changes: 12 additions & 3 deletions apps/auth0-evals/src/evals/quickstarts/nextjs/graders.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
import { contains, notContains, notContainsInSource, matches, judge, wroteFile, GraderLevel } from '@a0/eval-graders';
import {
contains,
notContains,
notContainsInSource,
matches,
judge,
wroteFile,
ranCommandOneOf,
GraderLevel,
} from '@a0/eval-graders';

export function defineGraders() {
return [
Expand Down Expand Up @@ -35,9 +44,9 @@ export function defineGraders() {
),

// ── L4: Structural / behavioral correctness ───────────────────────────────
// Event-based install/build verification temporarily disabled — see PR scoping discussion.
// Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
// ranCommand('npm install', '@auth0/nextjs-auth0', 'Ran npm install for @auth0/nextjs-auth0', GraderLevel.L4),
// ranCommandOneOf(['npm run build', 'next build'], 'Ran build to verify compilation', GraderLevel.L4),
ranCommandOneOf(['npm run build', 'next build'], 'Ran build to verify compilation', GraderLevel.L4),
wroteFile('.env', 'Wrote Auth0 credentials to .env file', GraderLevel.L4, [
'dev-barkbook.us.auth0.com',
'barkbook_client_abc123xyz',
Expand Down
1 change: 1 addition & 0 deletions apps/auth0-evals/src/evals/quickstarts/nuxt/PROMPT.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ id: nuxt_quickstart
name: Nuxt Quickstart
skills: auth0-nuxt
setup_command: npm install
compile_command: npm run build
---

## Task
Expand Down
15 changes: 12 additions & 3 deletions apps/auth0-evals/src/evals/quickstarts/nuxt/graders.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
import { contains, notContains, notContainsInSource, matches, judge, wroteFile, GraderLevel } from '@a0/eval-graders';
import {
contains,
notContains,
notContainsInSource,
matches,
judge,
wroteFile,
ranCommandOneOf,
GraderLevel,
} from '@a0/eval-graders';

export function defineGraders() {
return [
Expand Down Expand Up @@ -44,9 +53,9 @@ export function defineGraders() {
notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3),

// ── L4: Structural / behavioral correctness ───────────────────────────────
// Event-based install/build verification temporarily disabled — see PR scoping discussion.
// Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
// ranCommand('npm install', '@auth0/auth0-nuxt', 'Ran npm install for @auth0/auth0-nuxt', GraderLevel.L4),
// ranCommandOneOf(['npm run build', 'nuxt build'], 'Ran build to verify compilation', GraderLevel.L4),
ranCommandOneOf(['npm run build', 'nuxt build'], 'Ran build to verify compilation', GraderLevel.L4),
wroteFile('.env', 'Wrote Auth0 credentials to .env file', GraderLevel.L4, [
'dev-playground.us.auth0.com',
'playground_client_abc123xyz',
Expand Down
1 change: 1 addition & 0 deletions apps/auth0-evals/src/evals/quickstarts/react/PROMPT.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ name: React Quickstart
scaffold: src/evals/scaffolds/react/basic
skills: auth0-react
setup_command: npm install
compile_command: npm run build
---

## Task
Expand Down
6 changes: 3 additions & 3 deletions apps/auth0-evals/src/evals/quickstarts/react/graders.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { contains, notContains, matches, judge, GraderLevel } from '@a0/eval-graders';
import { contains, notContains, matches, judge, ranCommand, GraderLevel } from '@a0/eval-graders';

export function defineGraders() {
return [
Expand All @@ -21,9 +21,9 @@ export function defineGraders() {
notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3),

// ── L4: Structural / behavioral correctness ───────────────────────────────
// Event-based install/build verification temporarily disabled — see PR scoping discussion.
// Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
// ranCommand('npm install', '@auth0/auth0-react', 'Ran npm install for @auth0/auth0-react', GraderLevel.L4),
// ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
matches(String.raw`<Auth0Provider[\s\S]*?domain`, 'Auth0Provider configured with domain prop', GraderLevel.L4),
contains('getAccessTokenSilently', 'Uses getAccessTokenSilently to retrieve access token', GraderLevel.L4),
judge(
Expand Down
1 change: 1 addition & 0 deletions apps/auth0-evals/src/evals/quickstarts/spa-js/PROMPT.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ id: spa_js_quickstart
name: SPA JS Quickstart
skills: auth0-spa-js
setup_command: npm install
compile_command: npm run build
---

## Task
Expand Down
6 changes: 3 additions & 3 deletions apps/auth0-evals/src/evals/quickstarts/spa-js/graders.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { contains, notContains, matches, judge, GraderLevel } from '@a0/eval-graders';
import { contains, notContains, matches, judge, ranCommand, GraderLevel } from '@a0/eval-graders';

export function defineGraders() {
return [
Expand All @@ -22,9 +22,9 @@ export function defineGraders() {
notContains('sessionStorage.setItem', 'No tokens manually stored in sessionStorage', GraderLevel.L3),

// ── L4: Structural / behavioral correctness ───────────────────────────────
// Event-based install/build verification temporarily disabled — see PR scoping discussion.
// Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
// ranCommand('npm install', '@auth0/auth0-spa-js', 'Ran npm install for @auth0/auth0-spa-js', GraderLevel.L4),
// ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
matches(
String.raw`createAuth0Client\s*\(\s*\{[\s\S]*?domain`,
'Auth0Client configured with domain',
Expand Down
1 change: 1 addition & 0 deletions apps/auth0-evals/src/evals/quickstarts/vue/PROMPT.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ id: vue_quickstart
name: Vue Quickstart
skills: auth0-vue
setup_command: npm install
compile_command: npm run build
---

## Task
Expand Down
6 changes: 3 additions & 3 deletions apps/auth0-evals/src/evals/quickstarts/vue/graders.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { contains, notContains, matches, judge, GraderLevel } from '@a0/eval-graders';
import { contains, notContains, matches, judge, ranCommand, GraderLevel } from '@a0/eval-graders';

export function defineGraders() {
return [
Expand All @@ -21,9 +21,9 @@ export function defineGraders() {
notContains('sessionStorage.setItem', 'No tokens stored in sessionStorage', GraderLevel.L3),

// ── L4: Structural / behavioral correctness ───────────────────────────────
// Event-based install/build verification temporarily disabled — see PR scoping discussion.
// Install verification left disabled — a valid solution may edit package.json then run a bare `npm install`.
// ranCommand('npm install', '@auth0/auth0-vue', 'Ran npm install for @auth0/auth0-vue', GraderLevel.L4),
// ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
ranCommand('npm run', 'build', 'Ran build to verify compilation', GraderLevel.L4),
matches(String.raw`app\.use\s*\(\s*createAuth0`, 'Plugin installed via app.use(createAuth0(...))', GraderLevel.L4),
contains('getAccessTokenSilently', 'Uses getAccessTokenSilently to retrieve access token', GraderLevel.L4),
judge(
Expand Down
3 changes: 3 additions & 0 deletions docs/ADDING_EVALS.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ id: my_new_eval
name: My New Eval
skills: auth0-react
setup_command: npm install
compile_command: npm run build
---
```

Expand All @@ -41,6 +42,7 @@ setup_command: npm install
| `category` | no | Defaults to the parent directory name (e.g. `quickstarts`) |
| `skills` | no | Comma-separated skill names from [auth0/agent-skills](https://github.com/auth0/agent-skills). Injected into agent context when running with `--tools skills` |
| `setup_command` | no | Command run before the agent starts (e.g. `npm install`). Split on whitespace and executed directly via `spawnSync` — no shell, no operators (`&&`, `\|`, etc.), no quoting. One command only. |
| `compile_command` | no | Compile/build command (e.g. `npm run build`, `node --check server.js`, `.venv/bin/python -m py_compile main.py`). When set, an instruction pointing the agent at this command is appended to the agent's native context file (`CLAUDE.md` / `GEMINI.md` / `AGENTS.md` / `.github/copilot-instructions.md`) alongside the "no docs files" guidance, so the agent verifies the project compiles and the command shows up in the tool trace. Agent modes only — baseline ignores it. Omit for evals with no CLI compile step (e.g. mobile). |

To test a skill before it is pushed to the remote repo, see [TESTING_SKILLS.md](TESTING_SKILLS.md).

Expand All @@ -61,6 +63,7 @@ id: react_quickstart
name: React Quickstart
skills: auth0-react
setup_command: npm install
compile_command: npm run build
---

## Task
Expand Down
1 change: 1 addition & 0 deletions packages/eval-core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ export {
writeAgentGuidance,
AGENT_GUIDANCE,
AGENT_CONTEXT_FILENAMES,
compileGuidance,
collectFiles,
readWorkspaceFile,
isPathInside,
Expand Down
2 changes: 2 additions & 0 deletions packages/eval-core/src/loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ export async function loadEval(
.filter(Boolean);

const setupCommand = meta.setup_command || undefined;
const compileCommand = meta.compile_command || undefined;

return {
id: evalConfig.id,
Expand All @@ -74,6 +75,7 @@ export async function loadEval(
graders,
scaffold,
setupCommand,
compileCommand,
skills,
metadata: {
provider_name: meta.provider_name ?? 'Auth0',
Expand Down
1 change: 1 addition & 0 deletions packages/eval-core/src/types/eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ export interface EvalDefinition {
graders: GraderDef[];
scaffold: Record<string, string>;
setupCommand?: string;
compileCommand?: string;
skills: string[];
metadata: Record<string, string>;
}
1 change: 1 addition & 0 deletions packages/eval-core/src/workspace/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export {
writeAgentGuidance,
AGENT_GUIDANCE,
AGENT_CONTEXT_FILENAMES,
compileGuidance,
} from './workspace.js';
export type { SetupWorkspaceOptions, RunSetupCommandOptions } from './workspace.js';
export { collectFiles, readWorkspaceFile } from './file-utils.js';
Expand Down
20 changes: 16 additions & 4 deletions packages/eval-core/src/workspace/workspace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,15 @@ import { resolveInside } from './path-utils.js';
export const AGENT_GUIDANCE = `Do not create any documentation files (README.md, SETUP.md, QUICKSTART.md, IMPLEMENTATION_SUMMARY.md, or any other .md files). Do not create any .txt summary or verification files. Do not create standalone summary or status files of any kind (e.g. AUTH0_SETUP.ts, IMPLEMENTATION_COMPLETE.ts, QUICK_START.ts, FILES_CREATED.txt) — these are not application source code. Only create and modify source code files that are part of the application.
`;

/**
 * Renders the compile-verification instruction for a given command.
 *
 * The result is a single instruction sentence, a blank line, the command
 * wrapped in backticks, and a trailing newline. The command text is inserted
 * verbatim — no trimming or escaping is applied.
 *
 * @param compileCommand - Command the agent is told to run (e.g. `npm run build`).
 * @returns The guidance text, ending with a newline.
 */
export function compileGuidance(compileCommand: string): string {
  const instruction =
    'After making your changes, you MUST run this command to verify your integration compiles, and fix any errors it reports before finishing:';
  // Inline-code formatting so the command stands out from the sentence above.
  const quotedCommand = '`' + compileCommand + '`';
  return instruction + '\n\n' + quotedCommand + '\n';
}

/**
* The context/memory file each runner reads, relative to the workspace root.
* Writing guidance to the wrong file means the agent silently ignores it:
Expand All @@ -51,12 +60,15 @@ export const AGENT_CONTEXT_FILENAMES: Record<AgentType, string> = {
/**
* Writes {@link AGENT_GUIDANCE} into the context file the given runner reads.
* Appends (preserving any scaffold-provided content) when the file already
* exists; creates it otherwise.
* exists; creates it otherwise. When `compileCommand` is provided, the
* compile-verification guidance (see {@link compileGuidance}) is appended too.
*/
export function writeAgentGuidance(workspace: string, agentType: AgentType): void {
export function writeAgentGuidance(workspace: string, agentType: AgentType, compileCommand?: string): void {
const filename = AGENT_CONTEXT_FILENAMES[agentType];
const dest = join(workspace, filename);

const guidance = compileCommand ? `${AGENT_GUIDANCE}\n${compileGuidance(compileCommand)}` : AGENT_GUIDANCE;

// If the scaffold shipped AGENTS.md but the active runner reads a different
// file, rename it so the guidance reaches the right runner.
const scaffoldAgentsMd = join(workspace, 'AGENTS.md');
Expand All @@ -66,10 +78,10 @@ export function writeAgentGuidance(workspace: string, agentType: AgentType): voi
}

if (existsSync(dest)) {
appendFileSync(dest, `\n${AGENT_GUIDANCE}`, 'utf-8');
appendFileSync(dest, `\n${guidance}`, 'utf-8');
} else {
mkdirSync(join(dest, '..'), { recursive: true });
writeFileSync(dest, AGENT_GUIDANCE, 'utf-8');
writeFileSync(dest, guidance, 'utf-8');
}
}

Expand Down
22 changes: 20 additions & 2 deletions packages/eval-core/tests/loader.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,26 @@ describe('loadEval - setupCommand', () => {
});
});

// ── compile_command frontmatter tests ────────────────────────────────────────

describe('loadEval - compileCommand', () => {
  // Hands the prompt text to makeEvalDir, loads the eval, and returns only the
  // field under test so each case reads as a single expectation.
  const loadCompileCommand = async (promptSource: string) => {
    makeEvalDir(tmpBase, promptSource);
    const { compileCommand } = await loadEval(EVAL_CONFIG, tmpBase);
    return compileCommand;
  };

  it('parses compile_command from frontmatter', async () => {
    const withCompileCommand =
      '---\nskills: auth0-react\ncompile_command: npm run build\n---\n\n## Task\nDo the task.\n';

    expect(await loadCompileCommand(withCompileCommand)).toBe('npm run build');
  });

  it('returns undefined when compile_command is absent', async () => {
    const withoutCompileCommand = '---\nskills: auth0-react\n---\n\n## Task\nDo the task.\n';

    expect(await loadCompileCommand(withoutCompileCommand)).toBeUndefined();
  });
});

// ── System prompt tests ───────────────────────────────────────────────────────

describe('loadEval - system prompt', () => {
Expand Down Expand Up @@ -239,10 +259,8 @@ describe('loadEval - scaffold loading', () => {
expect(result.scaffold['readable.txt']).toBe('this is fine');
expect(result.scaffold['unreadable.txt']).toBeUndefined();
});

});


// ── frontmatter scaffold field tests ─────────────────────────────────────────

function makePromptWithScaffold(scaffoldPath: string): string {
Expand Down
Loading
Loading