From c5a307a90f2e9d381e0bfb9de05c9d6d9436fefb Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 05:19:23 +0000 Subject: [PATCH 1/7] Fix threat model: correct determinism claim, add Threat 9 specialist gaming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Correct '3× determinism check' → '2× on first spec only' (spec 0 runs twice; remaining specs run once to keep CI time manageable) - Document the known gap: stochastic agents that vary only on later specs may slip through the single-run check - Add Threat 9: Specialist gaming — old avg_rank model allowed a miner entering 3 easy specs at rank 1 to beat a well-rounded agent averaging rank 1.5 across all 45. Mitigated by switching overall leaderboard to breadth-normalized overall_score (unentered specs count as baseline 1.0) - Add 'Determinism check coverage' row to summary table Co-Authored-By: Claude Sonnet 4.6 --- docs/threat-model.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/threat-model.md b/docs/threat-model.md index f870f4d..3c02103 100644 --- a/docs/threat-model.md +++ b/docs/threat-model.md @@ -71,7 +71,7 @@ This document describes the attack surface of the Forge benchmark and the mitiga - FEA gate: CalculiX linear statics must pass; max stress ≤ allowable stress. - Geometry gate: `min_wall_thickness` constraint enforced via Shapely cross-section sampling. - Build volume, bolt pattern, overhang angle checked by `benchmark/geometry.py`. -- 3× determinism check: eval runs three times with the same seed; non-deterministic outputs are rejected. +- 2× determinism check on the first spec: CI runs spec 0 twice; if the score differs, the submission is flagged non-deterministic and rejected. Remaining specs run once to keep CI time manageable. (Note: stochastic agents that vary only on later specs could slip through — improving this is a known gap.) **Residual risk:** Low-medium. 
The wall-thickness sampler uses a finite grid; pathological thin bridges between grid sample points could still slip through. @@ -144,6 +144,18 @@ This document describes the attack surface of the Forge benchmark and the mitiga --- +## Threat 9 — Specialist gaming (leaderboard breadth gap) + +**Attack:** Enter only 3 easy specs (one per round) and achieve #1 on all three. Under an avg_rank model, this yields avg_rank = 1.0 — ranking above a well-rounded agent that competes on all 45 specs but averages rank 1.5. + +**Mitigations:** +- **Implemented:** Overall leaderboard now sorts by `overall_score` — mean normalized performance across ALL active specs. Unentered specs count as 1.0 (baseline) in the mean. A miner who skips 42 of 45 specs has 42 of the 45 terms in the mean pinned at 1.0, so their overall_score can fall only slightly below 1.0 no matter how well they do on the other 3; they rank below any agent that beats baseline by a comparable margin across the full problem pool. +- This directly rewards breadth: entering more specs and beating baseline on each one lowers your overall_score. + +**Residual risk:** Low. A miner who is genuinely #1 on every spec they enter still benefits from entering more specs — better coverage further reduces their overall_score. 
+ +--- + ## Summary table | Threat | Severity | Status | @@ -156,3 +168,5 @@ This document describes the attack surface of the Forge benchmark and the mitiga | Sybil submissions | Low | Mitigated (credibility + min PR requirement) | | Eval overfitting | Medium | Mitigated (hidden spec set + rotation) | | Load-case overfitting | Low | Mitigated (seeded load perturbation in FEA) | +| Specialist gaming | Medium | Mitigated (breadth-normalized overall_score) | +| Determinism check coverage | Low | Partial — only spec 0 runs 2× (gap: stochastic agents may vary on later specs) | From a9d52cf3328cc3f249736a42f72f0d5062cdd57f Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 05:27:41 +0000 Subject: [PATCH 2/7] Add CI concurrency limit + retry logic for submission posting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit eval.yml: Add concurrency group keyed on branch name with cancel-in-progress=true. When a miner pushes multiple commits rapidly to the same PR branch, only the latest push runs eval — prevents CI queue pile-up from spurious duplicate runs. record_submissions.py: Replace single-attempt POST with up to 3 attempts and exponential backoff between them (waits of 1s, then 2s). Transient forge-api blips (restart, brief 503) no longer silently drop leaderboard submissions. All three attempts must fail before the error is logged as non-blocking. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/eval.yml | 6 ++++++ scripts/record_submissions.py | 20 +++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index 96d6c93..244639c 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -5,6 +5,12 @@ on: paths: - "agents/**" +# Cancel any in-progress eval for the same PR branch when a new push arrives. +# Prevents CI queue pile-up when a miner pushes multiple commits in quick succession. 
+concurrency: + group: eval-${{ github.head_ref }} + cancel-in-progress: true + env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true diff --git a/scripts/record_submissions.py b/scripts/record_submissions.py index ede744c..61e935c 100644 --- a/scripts/record_submissions.py +++ b/scripts/record_submissions.py @@ -9,6 +9,7 @@ import json import os import sys +import time import urllib.error import urllib.request from pathlib import Path @@ -63,8 +64,17 @@ req = urllib.request.Request( url, data=body, headers={"Content-Type": "application/json"}, method="POST" ) - try: - with urllib.request.urlopen(req, timeout=15) as resp: - print(f"[{spec_id}] recorded: {resp.read().decode()[:80]}") - except urllib.error.URLError as e: - print(f"[{spec_id}] forge-api POST failed (non-blocking): {e}", flush=True) + # Retry up to 3 times with exponential backoff (waits of 1s, then 2s) so transient API hiccups + # don't silently drop leaderboard submissions. + last_exc: Exception | None = None + for attempt in range(3): + try: + with urllib.request.urlopen(req, timeout=15) as resp: + print(f"[{spec_id}] recorded: {resp.read().decode()[:80]}") + break + except urllib.error.URLError as e: + last_exc = e + if attempt < 2: + time.sleep(2 ** attempt) + else: + print(f"[{spec_id}] forge-api POST failed after 3 attempts (non-blocking): {last_exc}", flush=True) From 3a1b550b125e700da36249f527f8cd55ce6bbee7 Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 05:28:19 +0000 Subject: [PATCH 3/7] Add retry backoff to hidden eval API calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_hidden_eval.py GET and POST were single-attempt with no retry. Consolidate into _api_request() with up to 3 attempts and exponential backoff between them (waits of 1s, then 2s). Raises RuntimeError only after all attempts fail, which propagates cleanly to the CI step rather than leaving the hidden eval silently unrecorded. 
Co-Authored-By: Claude Sonnet 4.6 --- scripts/run_hidden_eval.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/scripts/run_hidden_eval.py b/scripts/run_hidden_eval.py index 4cee17e..6a9e698 100644 --- a/scripts/run_hidden_eval.py +++ b/scripts/run_hidden_eval.py @@ -25,7 +25,9 @@ import subprocess import sys import tempfile +import time import urllib.request +import urllib.error API_URL = os.environ.get("FORGE_API_URL", "").rstrip("/") ADMIN_KEY = os.environ.get("FORGE_ADMIN_KEY", "") @@ -48,11 +50,24 @@ workspace = os.getcwd() +def _api_request(req: urllib.request.Request, label: str) -> dict: + """Execute an API request with up to 3 attempts, sleeping 1s then 2s between them; raise RuntimeError if all fail.""" + last_exc: Exception | None = None + for attempt in range(3): + try: + with urllib.request.urlopen(req, timeout=15) as resp: + return json.loads(resp.read()) + except urllib.error.URLError as e: + last_exc = e + if attempt < 2: + time.sleep(2 ** attempt) + raise RuntimeError(f"[{label}] forge-api request failed after 3 attempts: {last_exc}") from last_exc + + def _api_get(path: str) -> dict: url = f"{API_URL}{path}" req = urllib.request.Request(url, headers={"Authorization": f"Bearer {ADMIN_KEY}"}) - with urllib.request.urlopen(req, timeout=15) as resp: - return json.loads(resp.read()) + return _api_request(req, f"GET {path}") def _api_post(path: str, payload: dict) -> dict: @@ -63,8 +78,7 @@ def _api_post(path: str, payload: dict) -> dict: headers={"Authorization": f"Bearer {ADMIN_KEY}", "Content-Type": "application/json"}, method="POST", ) - with urllib.request.urlopen(req, timeout=15) as resp: - return json.loads(resp.read()) + return _api_request(req, f"POST {path}") def _run_eval(spec: dict) -> dict: From 798a1ebd6626ce580c7ffd5285482ae7ca7afe15 Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 05:31:07 +0000 Subject: [PATCH 4/7] Fix stale docs: remove static-agent signature, update template to LLM-only PR #212 made generate(spec, llm) 
mandatory but left three places still showing generate(spec) as a supported alternative: - README.md: Remove static-agent code block; clarify that agents without the llm parameter are rejected at eval time - QUICKSTART.md: Remove static-agent section; remove reference to deleted examples/deterministic-agent/; update reference list to current examples - agents/template/agent.py: Change scaffold to use generate(spec, llm) signature with LLMClient import; add LLM usage example in TODO comments; remove misleading "Two supported signatures" docstring Without this fix, a miner following the scaffold + docs would write a static agent, run CI, and get a cryptic rejection error. Co-Authored-By: Claude Sonnet 4.6 --- QUICKSTART.md | 17 +++++------------ README.md | 12 ++---------- agents/template/agent.py | 30 +++++++++++++++++++----------- 3 files changed, 26 insertions(+), 33 deletions(-) diff --git a/QUICKSTART.md b/QUICKSTART.md index 599d599..89cac19 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -83,16 +83,8 @@ curl http://143.244.191.193:8000/specs/r01_001_easy forge new ``` -Edit `agents//agent.py`. Two supported signatures: +Edit `agents//agent.py`. The required signature is: -**Static agent** (no LLM): -```python -def generate(spec: dict) -> bytes: - """Takes the spec dict, returns STEP file bytes.""" - ... -``` - -**LLM agent** (recommended — harness injects the client): ```python from forge.sdk.llm import LLMClient @@ -102,13 +94,14 @@ def generate(spec: dict, llm: LLMClient) -> bytes: ... ``` -No API key needed — the harness injects `LLMClient` automatically using whitelisted models. See `examples/metric-aware-agent/agent.py` for a recommended starting point that adapts strategy to all three competition categories. +The harness injects `LLMClient` automatically — no API key needed. Agents that don't accept the `llm` parameter are rejected at eval time. + +Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`. 
Reference implementations: - `agents/baseline/` — solid bracket baseline; sets the upper-bound score every submission must beat -- `examples/metric-aware-agent/` — adapts geometry to mass / stiffness / deflection objectives +- `examples/metric-aware-agent/` — adapts geometry to mass / stiffness / deflection objectives (recommended starting point) - `examples/llm-agent/` — minimal LLM integration example -- `examples/deterministic-agent/` — pure geometry math, no LLM; shows algorithms are welcome --- diff --git a/README.md b/README.md index 03e3379..83e659c 100644 --- a/README.md +++ b/README.md @@ -103,16 +103,8 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for full guidelines. ## Agent interface -Two supported signatures — the harness detects which one you use automatically: +Your `agent.py` must export a single function: -**Static agent** (no LLM): -```python -def generate(spec: dict) -> bytes: - """Build and return STEP file bytes for the given spec.""" - ... -``` - -**LLM agent** (recommended): ```python from forge.sdk.llm import LLMClient @@ -122,7 +114,7 @@ def generate(spec: dict, llm: LLMClient) -> bytes: ... ``` -The harness injects `LLMClient` automatically — no API key required. Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`. +The harness injects `LLMClient` automatically — no API key required. Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`. Agents without the `llm` parameter are rejected at eval time. Three example agents in `examples/`: diff --git a/agents/template/agent.py b/agents/template/agent.py index 6d8f403..c2c8297 100644 --- a/agents/template/agent.py +++ b/agents/template/agent.py @@ -1,13 +1,12 @@ """ Template agent — start here. 
-Two supported signatures: +Required signature: - generate(spec: dict) -> bytes # static agent - generate(spec: dict, llm: LLMClient) -> bytes # LLM agent (recommended) + generate(spec: dict, llm: LLMClient) -> bytes -The harness detects which you use via inspect.signature and injects LLMClient -automatically if present — no API key required from you. +The harness injects LLMClient automatically — no API key required. +Agents without the `llm` parameter are rejected at eval time. See QUICKSTART.md for a full walkthrough. For a recommended starting point that adapts to all three competition categories, see examples/metric-aware-agent/. @@ -15,19 +14,19 @@ from __future__ import annotations -# To use the LLM client, uncomment: -# from forge.sdk.llm import LLMClient +from forge.sdk.llm import LLMClient # TODO: import your geometry library # from build123d import ... # recommended # from OCP.BRepPrimAPI import ... # raw OCP (see agents/baseline/) -def generate(spec: dict) -> bytes: +def generate(spec: dict, llm: LLMClient) -> bytes: """ Build and return a STEP file for the given spec. - To use an LLM, change the signature to: generate(spec, llm: LLMClient) + Use `llm.chat(messages)` to call the whitelisted LLM + (claude-haiku-4-5, claude-3-5-haiku, or gpt-4o-mini). Args: spec: Problem specification dict. Key fields: @@ -51,6 +50,15 @@ def generate(spec: dict) -> bytes: """ constraints = spec["constraints"] + metric = spec["scoring"]["metric"] # "mass_grams" | "stiffness_to_weight" | "deflection_mm" + + # TODO: use the LLM to reason about geometry parameters + # response = llm.chat([ + # {"role": "system", "content": "You are a structural engineering assistant."}, + # {"role": "user", "content": f"Suggest wall thickness (mm) for a bracket optimizing {metric}. " + # f"Load: {constraints['load_newtons']} N. 
Reply with a single number."}, + # ]) + # thickness_mm = float(response.strip()) # TODO: read the constraints you need # load_n = constraints["load_newtons"] @@ -59,14 +67,14 @@ def generate(spec: dict) -> bytes: # bolt_pattern = constraints["bolt_pattern_mm"] # [[y, z], ...] # bolt_d = constraints["bolt_diameter_clearance_mm"] - # TODO: build your geometry + # TODO: build your geometry using the parameters above # shape = ... # TODO: write to STEP and return bytes # return _to_step_bytes(shape) raise NotImplementedError( - "Replace this with your geometry. See QUICKSTART.md for examples." + "Replace this with your geometry. See examples/metric-aware-agent/ for a working example." ) From 381a1257405f89bd504afd6de5a4e200e2760401 Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 05:36:18 +0000 Subject: [PATCH 5/7] Fix false-positive optimization label when forge-api is unreachable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When FORGE_API_URL is temporarily down during a PR eval, the SOTA fetch silently returns nothing and all specs appear 'unclaimed' → anyBeatsSota becomes true → every passing agent gets the optimization label. Fix: track apiReachable separately. 404 responses from the API mark specs as genuinely unclaimed; no response (fetch failed, timeout, etc.) leaves sotaBySpec without that key. In the table-building loop, if the spec's key is missing from sotaBySpec AND the API was unreachable, show '— (API unreachable)' instead of '⭐ unclaimed' and don't set anyBeatsSota. This prevents false-positive optimization labels during API downtime. 
Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/eval.yml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index 244639c..5ba7491 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -147,15 +147,23 @@ jobs: round_003: 'Absolute Stiffness \u2193', }; - // Fetch live SOTA for each spec + // Fetch live SOTA for each spec. + // Track apiReachable separately: if the API is down, null sota entries + // should NOT be treated as "unclaimed" — we just don't know their status. const sotaBySpec = {}; + let apiReachable = !apiUrl; // treat "no URL configured" as reachable-but-empty for (const r of results) { if (!apiUrl) break; try { const resp = await fetch(`${apiUrl}/sota/${r.spec_id}`).catch(() => null); if (resp && resp.ok) { + apiReachable = true; const d = await resp.json(); sotaBySpec[r.spec_id] = d.score ?? d.score_grams ?? null; + } else if (resp && resp.status === 404) { + // Spec has no SOTA — genuinely unclaimed + apiReachable = true; + sotaBySpec[r.spec_id] = undefined; // use undefined to mark "unclaimed" } } catch {} } @@ -178,7 +186,9 @@ jobs: const rname = ROUND_NAME[r.round_id] || r.round_id; const score = res.score; // null when eval failed const baseline = r.baseline; - const sota = sotaBySpec[r.spec_id] ?? null; + // sotaBySpec[id] values: number = has SOTA, undefined = unclaimed, missing key = API was down + const sotaKnown = r.spec_id in sotaBySpec; + const sota = sotaBySpec[r.spec_id] ?? null; // null means unclaimed (if sotaKnown) const dir = r.direction; const icon = res.passed ? '\u2705' : '\u274c'; @@ -204,7 +214,10 @@ jobs: const vsBaseline = (pct >= 0 ? 
'+' : '') + pct.toFixed(1) + '%'; let sotaCell = '\u2014'; - if (sota === null) { + if (!apiReachable || !sotaKnown) { + // API was down — can't determine SOTA status; don't guess + sotaCell = '\u2014 (API unreachable)'; + } else if (sota === null || sota === undefined) { // Unclaimed spec — any passing submission sets new SOTA anyBeatsSota = true; sotaCell = '\u2b50 unclaimed \u2014 sets new SOTA'; From 10dc702676dc58ef5ea9b115817a7e49f073f02c Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 05:37:52 +0000 Subject: [PATCH 6/7] Enforce marginal-gain rule in PR comment and optimization label MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the PR comment compared raw scores against SOTA and applied the optimization label based on whether score < sota (minimize) or score > sota (maximize). This ignored the time-decay marginal-gain rule (1.0% for 0-7 days, 0.5% for 7-30 days, 0.1% for 30-90 days). A submission improving SOTA by 0.001g on day 3 would incorrectly receive the optimization label (2× Gittensor multiplier). Fix: after fetching the current SOTA score, also call GET /sota/{spec_id}/eligibility?score={eval_score} for each spec. Use the eligibility.eligible field (which applies the marginal-gain rule) to set anyBeatsSota and the label: - eligible=true → '✓ beats X (margin ok)' → optimization label - beats raw score but margin too small → '⚠ beats X — margin too small' - doesn't beat SOTA → current score shown, no label - API unreachable → '— (API unreachable)', no label Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/eval.yml | 58 +++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index 5ba7491..8642706 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -147,23 +147,37 @@ jobs: round_003: 'Absolute Stiffness \u2193', }; - // Fetch live SOTA for each spec. 
- // Track apiReachable separately: if the API is down, null sota entries - // should NOT be treated as "unclaimed" — we just don't know their status. + // Fetch live SOTA + eligibility for each spec. + // sotaBySpec[id]: { score: number, eligible: boolean } | 'unclaimed' | undefined (key never set — API down) + // We check the eligibility endpoint with the actual eval score to enforce the + // marginal-gain rule (1% improvement required for 0–7 day old SOTAs, etc.). const sotaBySpec = {}; let apiReachable = !apiUrl; // treat "no URL configured" as reachable-but-empty for (const r of results) { if (!apiUrl) break; try { - const resp = await fetch(`${apiUrl}/sota/${r.spec_id}`).catch(() => null); - if (resp && resp.ok) { + // Fetch current SOTA score + const sotaResp = await fetch(`${apiUrl}/sota/${r.spec_id}`).catch(() => null); + if (sotaResp && sotaResp.status === 404) { apiReachable = true; - const d = await resp.json(); - sotaBySpec[r.spec_id] = d.score ?? d.score_grams ?? null; - } else if (resp && resp.status === 404) { - // Spec has no SOTA — genuinely unclaimed + sotaBySpec[r.spec_id] = 'unclaimed'; + } else if (sotaResp && sotaResp.ok) { apiReachable = true; - sotaBySpec[r.spec_id] = undefined; // use undefined to mark "unclaimed" + const d = await sotaResp.json(); + const currentScore = d.score ?? d.score_grams ?? 
null; + // Check marginal-gain eligibility with the eval score + const evalScore = r.result?.score; + let eligible = false; + if (evalScore !== null && evalScore !== undefined) { + const elResp = await fetch( + `${apiUrl}/sota/${r.spec_id}/eligibility?score=${evalScore}` + ).catch(() => null); + if (elResp && elResp.ok) { + const el = await elResp.json(); + eligible = el.eligible === true; + } + } + sotaBySpec[r.spec_id] = { score: currentScore, eligible }; } } catch {} } @@ -186,9 +200,9 @@ jobs: const rname = ROUND_NAME[r.round_id] || r.round_id; const score = res.score; // null when eval failed const baseline = r.baseline; - // sotaBySpec[id] values: number = has SOTA, undefined = unclaimed, missing key = API was down - const sotaKnown = r.spec_id in sotaBySpec; - const sota = sotaBySpec[r.spec_id] ?? null; // null means unclaimed (if sotaKnown) + // sotaBySpec[id]: { score, eligible } | 'unclaimed' | undefined (API was down) + const sotaData = sotaBySpec[r.spec_id]; + const sotaKnown = sotaData !== undefined; const dir = r.direction; const icon = res.passed ? '\u2705' : '\u274c'; @@ -217,17 +231,21 @@ jobs: if (!apiReachable || !sotaKnown) { // API was down — can't determine SOTA status; don't guess sotaCell = '\u2014 (API unreachable)'; - } else if (sota === null || sota === undefined) { - // Unclaimed spec — any passing submission sets new SOTA + } else if (sotaData === 'unclaimed') { + // No existing SOTA — any passing submission sets it anyBeatsSota = true; sotaCell = '\u2b50 unclaimed \u2014 sets new SOTA'; - } else { - const beats = dir === 'maximize' ? score > sota : score < sota; + } else if (sotaData && sotaData.eligible) { + // Beats current SOTA with sufficient margin (marginal-gain rule satisfied) + anyBeatsSota = true; + sotaCell = `\u2713 beats ${fmt(sotaData.score, r.metric)} ${unit} (margin ok)`; + } else if (sotaData && sotaData.score !== null) { + const beats = dir === 'maximize' ? 
score > sotaData.score : score < sotaData.score; if (beats) { - anyBeatsSota = true; - sotaCell = `\u2713 beats ${fmt(sota, r.metric)} ${unit}`; + // Beats raw score but margin threshold not met + sotaCell = `\u26a0\ufe0f beats ${fmt(sotaData.score, r.metric)} ${unit} — margin too small`; } else { - sotaCell = `${fmt(sota, r.metric)} ${unit}`; + sotaCell = `${fmt(sotaData.score, r.metric)} ${unit}`; } } From e3be87557e7dc58f2aea3d289952071d6e558e6b Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 05:40:27 +0000 Subject: [PATCH 7/7] Fix baseline agent signature to comply with LLM-only contract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The harness (since PR #212) rejects agents with a one-param generate(spec) signature. The baseline was using the old single-param form and would fail eval if anyone tried to run it through CI. Updated to generate(spec, llm: LLMClient) to match the required contract. The baseline does not call the LLM — it is deterministic geometry — so llm is accepted but unused (noqa: ARG001). The docstring clarifies this is permitted: the harness requires the parameter in the signature, not that it must be called. Co-Authored-By: Claude Sonnet 4.6 --- agents/baseline/agent.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/agents/baseline/agent.py b/agents/baseline/agent.py index d0d1df8..c7e0cc9 100644 --- a/agents/baseline/agent.py +++ b/agents/baseline/agent.py @@ -4,6 +4,10 @@ Score: ~165g. Miners beat this by removing material where stress is low. The bracket has a vertical mounting plate (bolt holes), a horizontal shelf (reaches the load point), and no topology optimization whatsoever. + +This agent accepts the LLMClient parameter but does not use it — it is +purely deterministic geometry. This is permitted; the harness requires +the parameter to be present in the signature, not necessarily used. 
""" from __future__ import annotations @@ -11,8 +15,10 @@ import os import tempfile +from forge.sdk.llm import LLMClient + -def generate(spec: dict) -> bytes: +def generate(spec: dict, llm: LLMClient) -> bytes: # noqa: ARG001 """Build a parametric L-bracket and return STEP bytes.""" from OCP.BRepAlgoAPI import BRepAlgoAPI_Cut, BRepAlgoAPI_Fuse from OCP.BRepPrimAPI import BRepPrimAPI_MakeBox, BRepPrimAPI_MakeCylinder