PunchTheDev · PunchTheDev · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
@@ -5,6 +5,12 @@ on:
     paths:
       - "agents/**"
 
+# Cancel any in-progress eval for the same PR branch when a new push arrives.
+# Prevents CI queue pile-up when a miner pushes multiple commits in quick succession.
+concurrency:
+  group: eval-${{ github.head_ref }}
+  cancel-in-progress: true
+
 env:
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
 
@@ -141,15 +147,37 @@ jobs:
               round_003: 'Absolute Stiffness \u2193',
             };
 
-            // Fetch live SOTA for each spec
+            // Fetch live SOTA + eligibility for each spec.
+            // sotaBySpec[id]: { score: number | null, eligible: boolean } | 'unclaimed' | undefined (lookup failed — key never set)
+            // We check the eligibility endpoint with the actual eval score to enforce the
+            // marginal-gain rule (1% improvement required for 0–7 day old SOTAs, etc.).
             const sotaBySpec = {};
+            let apiReachable = !apiUrl; // treat "no URL configured" as reachable-but-empty
             for (const r of results) {
               if (!apiUrl) break;
               try {
-                const resp = await fetch(`${apiUrl}/sota/${r.spec_id}`).catch(() => null);
-                if (resp && resp.ok) {
-                  const d = await resp.json();
-                  sotaBySpec[r.spec_id] = d.score ?? d.score_grams ?? null;
+                // Fetch current SOTA score
+                const sotaResp = await fetch(`${apiUrl}/sota/${r.spec_id}`).catch(() => null);
+                if (sotaResp && sotaResp.status === 404) {
+                  apiReachable = true;
+                  sotaBySpec[r.spec_id] = 'unclaimed';
+                } else if (sotaResp && sotaResp.ok) {
+                  apiReachable = true;
+                  const d = await sotaResp.json();
+                  const currentScore = d.score ?? d.score_grams ?? null;
+                  // Check marginal-gain eligibility with the eval score
+                  const evalScore = r.result?.score;
+                  let eligible = false;
+                  if (evalScore !== null && evalScore !== undefined) {
+                    const elResp = await fetch(
+                      `${apiUrl}/sota/${r.spec_id}/eligibility?score=${evalScore}`
+                    ).catch(() => null);
+                    if (elResp && elResp.ok) {
+                      const el = await elResp.json();
+                      eligible = el.eligible === true;
+                    }
+                  }
+                  sotaBySpec[r.spec_id] = { score: currentScore, eligible };
                 }
               } catch {}
             }
@@ -172,7 +200,9 @@ jobs:
               const rname = ROUND_NAME[r.round_id] || r.round_id;
               const score = res.score;  // null when eval failed
               const baseline = r.baseline;
-              const sota = sotaBySpec[r.spec_id] ?? null;
+              // sotaBySpec[id]: { score, eligible } | 'unclaimed' | undefined (API was down)
+              const sotaData = sotaBySpec[r.spec_id];
+              const sotaKnown = sotaData !== undefined;
               const dir = r.direction;
               const icon = res.passed ? '\u2705' : '\u274c';
 
@@ -198,17 +228,24 @@ jobs:
               const vsBaseline = (pct >= 0 ? '+' : '') + pct.toFixed(1) + '%';
 
               let sotaCell = '\u2014';
-              if (sota === null) {
-                // Unclaimed spec — any passing submission sets new SOTA
+              if (!apiReachable || !sotaKnown) {
+                // SOTA lookup failed or was skipped (API down, non-OK response, or no API URL) — don't guess
+                sotaCell = '\u2014 (API unreachable)';
+              } else if (sotaData === 'unclaimed') {
+                // No existing SOTA — any passing submission sets it
                 anyBeatsSota = true;
                 sotaCell = '\u2b50 unclaimed \u2014 sets new SOTA';
-              } else {
-                const beats = dir === 'maximize' ? score > sota : score < sota;
+              } else if (sotaData && sotaData.eligible) {
+                // Beats current SOTA with sufficient margin (marginal-gain rule satisfied)
+                anyBeatsSota = true;
+                sotaCell = `\u2713 beats ${fmt(sotaData.score, r.metric)} ${unit} (margin ok)`;
+              } else if (sotaData && sotaData.score !== null) {
+                const beats = dir === 'maximize' ? score > sotaData.score : score < sotaData.score;
                 if (beats) {
-                  anyBeatsSota = true;
-                  sotaCell = `\u2713 beats ${fmt(sota, r.metric)} ${unit}`;
+                  // Beats raw score but margin threshold not met
+                  sotaCell = `\u26a0\ufe0f beats ${fmt(sotaData.score, r.metric)} ${unit} — margin too small`;
                 } else {
-                  sotaCell = `${fmt(sota, r.metric)} ${unit}`;
+                  sotaCell = `${fmt(sotaData.score, r.metric)} ${unit}`;
                 }
               }
 

diff --git a/QUICKSTART.md b/QUICKSTART.md
@@ -83,16 +83,8 @@ curl http://143.244.191.193:8000/specs/r01_001_easy
 forge new <your-name>
 ```
 
-Edit `agents/<your-name>/agent.py`. Two supported signatures:
+Edit `agents/<your-name>/agent.py`. The required signature is:
 
-**Static agent** (no LLM):
-```python
-def generate(spec: dict) -> bytes:
-    """Takes the spec dict, returns STEP file bytes."""
-    ...
-```
-
-**LLM agent** (recommended — harness injects the client):
 ```python
 from forge.sdk.llm import LLMClient
 
@@ -102,13 +94,14 @@ def generate(spec: dict, llm: LLMClient) -> bytes:
     ...
 ```
 
-No API key needed — the harness injects `LLMClient` automatically using whitelisted models. See `examples/metric-aware-agent/agent.py` for a recommended starting point that adapts strategy to all three competition categories.
+The harness injects `LLMClient` automatically — no API key needed. Agents that don't accept the `llm` parameter are rejected at eval time.
+
+Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`.
 
 Reference implementations:
 - `agents/baseline/` — solid bracket baseline; sets the upper-bound score every submission must beat
-- `examples/metric-aware-agent/` — adapts geometry to mass / stiffness / deflection objectives
+- `examples/metric-aware-agent/` — adapts geometry to mass / stiffness / deflection objectives (recommended starting point)
 - `examples/llm-agent/` — minimal LLM integration example
-- `examples/deterministic-agent/` — pure geometry math, no LLM; shows algorithms are welcome
 
 ---
 

diff --git a/README.md b/README.md
@@ -103,16 +103,8 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for full guidelines.
 
 ## Agent interface
 
-Two supported signatures — the harness detects which one you use automatically:
+Your `agent.py` must export a single function:
 
-**Static agent** (no LLM):
-```python
-def generate(spec: dict) -> bytes:
-    """Build and return STEP file bytes for the given spec."""
-    ...
-```
-
-**LLM agent** (recommended):
 ```python
 from forge.sdk.llm import LLMClient
 
@@ -122,7 +114,7 @@ def generate(spec: dict, llm: LLMClient) -> bytes:
     ...
 ```
 
-The harness injects `LLMClient` automatically — no API key required. Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`.
+The harness injects `LLMClient` automatically — no API key required. Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`. Agents without the `llm` parameter are rejected at eval time.
 
 Three example agents in `examples/`:
 

diff --git a/agents/baseline/agent.py b/agents/baseline/agent.py
@@ -4,15 +4,21 @@
 Score: ~165g. Miners beat this by removing material where stress is low.
 The bracket has a vertical mounting plate (bolt holes), a horizontal shelf
 (reaches the load point), and no topology optimization whatsoever.
+
+This agent accepts the LLMClient parameter but does not use it — it is
+purely deterministic geometry. This is permitted; the harness requires
+the parameter to be present in the signature, not necessarily used.
 """
 
 from __future__ import annotations
 
 import os
 import tempfile
 
+from forge.sdk.llm import LLMClient
+
 
-def generate(spec: dict) -> bytes:
+def generate(spec: dict, llm: LLMClient) -> bytes:  # noqa: ARG001
     """Build a parametric L-bracket and return STEP bytes."""
     from OCP.BRepAlgoAPI import BRepAlgoAPI_Cut, BRepAlgoAPI_Fuse
     from OCP.BRepPrimAPI import BRepPrimAPI_MakeBox, BRepPrimAPI_MakeCylinder

diff --git a/agents/template/agent.py b/agents/template/agent.py
@@ -1,33 +1,32 @@
 """
 Template agent — start here.
 
-Two supported signatures:
+Required signature:
 
-    generate(spec: dict) -> bytes              # static agent
-    generate(spec: dict, llm: LLMClient) -> bytes  # LLM agent (recommended)
+    generate(spec: dict, llm: LLMClient) -> bytes
 
-The harness detects which you use via inspect.signature and injects LLMClient
-automatically if present — no API key required from you.
+The harness injects LLMClient automatically — no API key required.
+Agents without the `llm` parameter are rejected at eval time.
 
 See QUICKSTART.md for a full walkthrough. For a recommended starting point
 that adapts to all three competition categories, see examples/metric-aware-agent/.
 """
 
 from __future__ import annotations
 
-# To use the LLM client, uncomment:
-# from forge.sdk.llm import LLMClient
+from forge.sdk.llm import LLMClient
 
 # TODO: import your geometry library
 # from build123d import ...          # recommended
 # from OCP.BRepPrimAPI import ...    # raw OCP (see agents/baseline/)
 
 
-def generate(spec: dict) -> bytes:
+def generate(spec: dict, llm: LLMClient) -> bytes:
     """
     Build and return a STEP file for the given spec.
 
-    To use an LLM, change the signature to: generate(spec, llm: LLMClient)
+    Use `llm.chat(messages)` to call one of the whitelisted models
+    (claude-haiku-4-5, claude-3-5-haiku, or gpt-4o-mini).
 
     Args:
         spec: Problem specification dict. Key fields:
@@ -51,6 +50,15 @@ def generate(spec: dict) -> bytes:
     """
 
     constraints = spec["constraints"]
+    metric = spec["scoring"]["metric"]  # "mass_grams" | "stiffness_to_weight" | "deflection_mm"
+
+    # TODO: use the LLM to reason about geometry parameters
+    # response = llm.chat([
+    #     {"role": "system", "content": "You are a structural engineering assistant."},
+    #     {"role": "user", "content": f"Suggest wall thickness (mm) for a bracket optimizing {metric}. "
+    #                                 f"Load: {constraints['load_newtons']} N. Reply with a single number."},
+    # ])
+    # thickness_mm = float(response.strip())
 
     # TODO: read the constraints you need
     # load_n = constraints["load_newtons"]
@@ -59,14 +67,14 @@ def generate(spec: dict) -> bytes:
     # bolt_pattern = constraints["bolt_pattern_mm"]  # [[y, z], ...]
     # bolt_d = constraints["bolt_diameter_clearance_mm"]
 
-    # TODO: build your geometry
+    # TODO: build your geometry using the parameters above
     # shape = ...
 
     # TODO: write to STEP and return bytes
     # return _to_step_bytes(shape)
 
     raise NotImplementedError(
-        "Replace this with your geometry. See QUICKSTART.md for examples."
+        "Replace this with your geometry. See examples/metric-aware-agent/ for a working example."
     )
 
 

diff --git a/docs/threat-model.md b/docs/threat-model.md
@@ -71,7 +71,7 @@ This document describes the attack surface of the Forge benchmark and the mitiga
 - FEA gate: CalculiX linear statics must pass; max stress ≤ allowable stress.
 - Geometry gate: `min_wall_thickness` constraint enforced via Shapely cross-section sampling.
 - Build volume, bolt pattern, overhang angle checked by `benchmark/geometry.py`.
-- 3× determinism check: eval runs three times with the same seed; non-deterministic outputs are rejected.
+- 2× determinism check on the first spec: CI runs spec 0 twice; if the score differs, the submission is flagged non-deterministic and rejected. Remaining specs run once to keep CI time manageable. (Note: stochastic agents that vary only on later specs could slip through — improving this is a known gap.)
 
 **Residual risk:** Low-medium. The wall-thickness sampler uses a finite grid; pathological thin bridges between grid sample points could still slip through.
 
@@ -144,6 +144,18 @@ This document describes the attack surface of the Forge benchmark and the mitiga
 
 ---
 
+## Threat 9 — Specialist gaming (leaderboard breadth gap)
+
+**Attack:** Enter only 3 easy specs (one per round) and achieve #1 on all three. Under an avg_rank model, this yields avg_rank = 1.0 — ranking above a well-rounded agent that competes on all 45 specs but averages rank 1.5.
+
+**Mitigations:**
+- **Implemented:** Overall leaderboard now sorts by `overall_score` — mean normalized performance across ALL active specs (lower is better). Unentered specs count as 1.0 (baseline) in the mean. A miner who skips 42 of 45 specs is therefore floored near 1.0: with normalized scores ≥ 0, the best achievable overall_score is 42/45 ≈ 0.93, so they rank below any agent that beats baseline by a wider average margin across the full problem pool.
+- This directly rewards breadth: entering more specs and beating baseline on each one lowers your overall_score.
+
+**Residual risk:** Low. A miner who is genuinely #1 on every spec they enter still benefits from entering more specs — better coverage further reduces their overall_score.
+
+---
+
 ## Summary table
 
 | Threat | Severity | Status |
@@ -156,3 +168,5 @@ This document describes the attack surface of the Forge benchmark and the mitiga
 | Sybil submissions | Low | Mitigated (credibility + min PR requirement) |
 | Eval overfitting | Medium | Mitigated (hidden spec set + rotation) |
 | Load-case overfitting | Low | Mitigated (seeded load perturbation in FEA) |
+| Specialist gaming | Medium | Mitigated (breadth-normalized overall_score) |
+| Determinism check coverage | Low | Partial — only spec 0 runs 2× (gap: stochastic agents may vary on later specs) |
diff --git a/scripts/record_submissions.py b/scripts/record_submissions.py
@@ -9,6 +9,7 @@
 import json
 import os
 import sys
+import time
 import urllib.error
 import urllib.request
 from pathlib import Path
@@ -63,8 +64,17 @@
     req = urllib.request.Request(
         url, data=body, headers={"Content-Type": "application/json"}, method="POST"
     )
-    try:
-        with urllib.request.urlopen(req, timeout=15) as resp:
-            print(f"[{spec_id}] recorded: {resp.read().decode()[:80]}")
-    except urllib.error.URLError as e:
-        print(f"[{spec_id}] forge-api POST failed (non-blocking): {e}", flush=True)
+    # Retry up to 3 attempts with exponential backoff (1s, then 2s, between attempts)
+    # so transient API hiccups don't silently drop leaderboard submissions.
+    last_exc: Exception | None = None
+    for attempt in range(3):
+        try:
+            with urllib.request.urlopen(req, timeout=15) as resp:
+                print(f"[{spec_id}] recorded: {resp.read().decode()[:80]}")
+                break
+        except urllib.error.URLError as e:
+            last_exc = e
+            if attempt < 2:
+                time.sleep(2 ** attempt)
+    else:
+        print(f"[{spec_id}] forge-api POST failed after 3 attempts (non-blocking): {last_exc}", flush=True)
diff --git a/scripts/run_hidden_eval.py b/scripts/run_hidden_eval.py
@@ -25,7 +25,9 @@
 import subprocess
 import sys
 import tempfile
+import time
 import urllib.request
+import urllib.error
 
 API_URL = os.environ.get("FORGE_API_URL", "").rstrip("/")
 ADMIN_KEY = os.environ.get("FORGE_ADMIN_KEY", "")
@@ -48,11 +50,24 @@
 workspace = os.getcwd()
 
 
+def _api_request(req: urllib.request.Request, label: str) -> dict:
+    """Send req and return its parsed JSON body, making up to 3 attempts (sleeping 1s, then 2s, between them); raises RuntimeError if all fail."""
+    last_exc: Exception | None = None
+    for attempt in range(3):
+        try:
+            with urllib.request.urlopen(req, timeout=15) as resp:
+                return json.loads(resp.read())
+        except urllib.error.URLError as e:  # HTTPError subclasses URLError, so HTTP error statuses are retried too
+            last_exc = e
+            if attempt < 2:  # no sleep after the final attempt
+                time.sleep(2 ** attempt)
+    raise RuntimeError(f"[{label}] forge-api request failed after 3 attempts: {last_exc}") from last_exc
+
+
 def _api_get(path: str) -> dict:
     url = f"{API_URL}{path}"
     req = urllib.request.Request(url, headers={"Authorization": f"Bearer {ADMIN_KEY}"})
-    with urllib.request.urlopen(req, timeout=15) as resp:
-        return json.loads(resp.read())
+    return _api_request(req, f"GET {path}")
 
 
 def _api_post(path: str, payload: dict) -> dict:
@@ -63,8 +78,7 @@ def _api_post(path: str, payload: dict) -> dict:
         headers={"Authorization": f"Bearer {ADMIN_KEY}", "Content-Type": "application/json"},
         method="POST",
     )
-    with urllib.request.urlopen(req, timeout=15) as resp:
-        return json.loads(resp.read())
+    return _api_request(req, f"POST {path}")
 
 
 def _run_eval(spec: dict) -> dict: