Update judge models to GPT-5.4 and Claude Sonnet 4.6; lower eval task concurrency

jahooma · jahooma · commit bd97765940bb · 2026-04-14T10:33:30.000-07:00
diff --git a/evals/buffbench/judge.ts b/evals/buffbench/judge.ts
@@ -123,7 +123,7 @@ Provide detailed analysis, strengths, weaknesses, and numerical scores.`,
 const judgeAgents: Record<string, AgentDefinition> = {
   'judge-gpt': {
     id: 'judge-gpt',
-    model: 'openai/gpt-5.1',
+    model: 'openai/gpt-5.4',
     ...judgeAgentBase,
   },
   'judge-gemini': {
@@ -133,7 +133,7 @@ const judgeAgents: Record<string, AgentDefinition> = {
   },
   'judge-sonnet': {
     id: 'judge-claude',
-    model: 'anthropic/claude-sonnet-4.5',
+    model: 'anthropic/claude-sonnet-4.6',
     ...judgeAgentBase,
   },
 }
diff --git a/evals/buffbench/main-nightly.ts b/evals/buffbench/main-nightly.ts
@@ -17,7 +17,7 @@ async function main() {
   const results = await runBuffBench({
     evalDataPaths: [ path.join(__dirname, 'eval-codebuff.json')],
     agents: ['base2-free'],
-    taskConcurrency: 6,
+    taskConcurrency: 5,
     saveTraces,
   })
 
diff --git a/evals/buffbench/main.ts b/evals/buffbench/main.ts
@@ -11,7 +11,7 @@ async function main() {
   await runBuffBench({
     evalDataPaths: [path.join(__dirname, 'eval-codebuff.json')],
     agents: ['base2-free-evals'],
-    taskConcurrency: 10,
+    taskConcurrency: 6,
     saveTraces,
   })