Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/multi-model-eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ jobs:
concurrency: 20
- model: opus
concurrency: 20
- model: gpt-4.1
concurrency: 5
- model: gpt-5.4
concurrency: 1
- model: gemini-2.5-pro
concurrency: 20
steps:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@ CLAUDE.local.md
resources/deno/
/scripts/dist
evals/promptfoo/results.json
evals/promptfoo/promptfooconfig.yaml
evals/promptfoo/node_modules/
2 changes: 1 addition & 1 deletion deno.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"audit": "deno run --allow-read --allow-net=api.osv.dev --allow-env=GITHUB_STEP_SUMMARY --allow-write scripts/audit_deps.ts",
"audit-actions": "deno run --allow-read --allow-net=api.github.com --allow-env=GITHUB_STEP_SUMMARY,GITHUB_TOKEN --allow-write scripts/audit_actions.ts",
"review-skills": "deno run --allow-read --allow-run --allow-env=GITHUB_STEP_SUMMARY --allow-write scripts/review_skills.ts",
"eval-skill-triggers": "deno run --allow-read --allow-run --allow-env --allow-write scripts/eval_skill_triggers_promptfoo.ts"
"eval-skill-triggers": "deno run --allow-read --allow-run --allow-env --allow-write --allow-net scripts/eval_skill_triggers_promptfoo.ts"
},
"imports": {
"@std/assert": "jsr:@std/assert@^1.0.18",
Expand Down
11 changes: 7 additions & 4 deletions evals/promptfoo/generate_config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import { parse as parseYaml, stringify as stringifyYaml } from "@std/yaml";
/**
 * Describes one model provider entry stored in the provider registry
 * (`Record<string, ProviderDefinition>`), keyed by a short model name.
 */
interface ProviderDefinition {
/** Provider identifier string, e.g. "openai:gpt-5.4" or "google:gemini-2.5-pro". */
id: string;
/**
 * Presumably the name of the environment variable that holds the API key
 * (e.g. "OPENAI_API_KEY") — not the key itself; confirm against the caller.
 */
apiKeyEnv: string;
/**
 * Optional per-provider delay. The visible config-building code copies it into
 * the generated provider entry only when it is truthy, so `0` and `undefined`
 * both result in no `delay` key being emitted.
 * NOTE(review): units are presumably milliseconds between requests — confirm
 * against the promptfoo provider configuration documentation.
 */
delay?: number;
}

const PROVIDER_REGISTRY: Record<string, ProviderDefinition> = {
Expand All @@ -46,9 +47,10 @@ const PROVIDER_REGISTRY: Record<string, ProviderDefinition> = {
id: "anthropic:messages:claude-opus-4-6",
apiKeyEnv: "ANTHROPIC_API_KEY",
},
"gpt-4.1": {
id: "openai:gpt-4.1",
"gpt-5.4": {
id: "openai:gpt-5.4",
apiKeyEnv: "OPENAI_API_KEY",
delay: 500,
},
"gemini-2.5-pro": {
id: "google:gemini-2.5-pro",
Expand Down Expand Up @@ -206,7 +208,7 @@ async function main(): Promise<void> {
`try {`,
` const parsed = typeof output === 'object' ? (Array.isArray(output) ? output : [output]) : JSON.parse(str);`,
` const calls = Array.isArray(parsed) ? parsed : [parsed];`,
` return calls.some(c => c.function?.name === needle || c.name === needle);`,
` return calls.some(c => c.function?.name === needle || c.name === needle || c.functionCall?.name === needle);`,
`} catch {}`,
`return str.includes('"' + needle + '"');`,
].join("\n"),
Expand All @@ -226,7 +228,7 @@ async function main(): Promise<void> {
`try {`,
` const parsed = typeof output === 'object' ? (Array.isArray(output) ? output : [output]) : JSON.parse(str);`,
` const calls = Array.isArray(parsed) ? parsed : [parsed];`,
` return !calls.some(c => c.function?.name === needle || c.name === needle);`,
` return !calls.some(c => c.function?.name === needle || c.name === needle || c.functionCall?.name === needle);`,
`} catch {}`,
`return !str.includes('"' + needle + '"');`,
].join("\n"),
Expand All @@ -252,6 +254,7 @@ async function main(): Promise<void> {
systemMessage,
tools,
},
...(provider.delay ? { delay: provider.delay } : {}),
},
],
tests,
Expand Down
4 changes: 3 additions & 1 deletion evals/promptfoo/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
"overrides": {
"@anthropic-ai/sdk": "^0.81.0",
"hono": ">=4.12.12",
"@hono/node-server": ">=1.19.13"
"@hono/node-server": ">=1.19.13",
"axios": ">=1.15.0",
"basic-ftp": ">=5.2.1"
}
}
Loading
Loading