Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 50 additions & 13 deletions .github/workflows/eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ on:
paths:
- "agents/**"

# Cancel any in-progress eval for the same PR branch when a new push arrives.
# Prevents CI queue pile-up when a miner pushes multiple commits in quick succession.
concurrency:
group: eval-${{ github.head_ref }}
cancel-in-progress: true

env:
FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

Expand Down Expand Up @@ -141,15 +147,37 @@ jobs:
round_003: 'Absolute Stiffness \u2193',
};

// Fetch live SOTA for each spec
// Fetch live SOTA + eligibility for each spec.
// sotaBySpec[id]: { score: number | null, eligible: boolean } | 'unclaimed' | undefined (API down or unexpected response)
// We check the eligibility endpoint with the actual eval score to enforce the
// marginal-gain rule (1% improvement required for 0–7 day old SOTAs, etc.).
const sotaBySpec = {};
let apiReachable = !apiUrl; // treat "no URL configured" as reachable-but-empty
for (const r of results) {
if (!apiUrl) break;
try {
const resp = await fetch(`${apiUrl}/sota/${r.spec_id}`).catch(() => null);
if (resp && resp.ok) {
const d = await resp.json();
sotaBySpec[r.spec_id] = d.score ?? d.score_grams ?? null;
// Fetch current SOTA score
const sotaResp = await fetch(`${apiUrl}/sota/${r.spec_id}`).catch(() => null);
if (sotaResp && sotaResp.status === 404) {
apiReachable = true;
sotaBySpec[r.spec_id] = 'unclaimed';
} else if (sotaResp && sotaResp.ok) {
apiReachable = true;
const d = await sotaResp.json();
const currentScore = d.score ?? d.score_grams ?? null;
// Check marginal-gain eligibility with the eval score
const evalScore = r.result?.score;
let eligible = false;
if (evalScore !== null && evalScore !== undefined) {
const elResp = await fetch(
`${apiUrl}/sota/${r.spec_id}/eligibility?score=${evalScore}`
).catch(() => null);
if (elResp && elResp.ok) {
const el = await elResp.json();
eligible = el.eligible === true;
}
}
sotaBySpec[r.spec_id] = { score: currentScore, eligible };
}
} catch {}
}
Expand All @@ -172,7 +200,9 @@ jobs:
const rname = ROUND_NAME[r.round_id] || r.round_id;
const score = res.score; // null when eval failed
const baseline = r.baseline;
const sota = sotaBySpec[r.spec_id] ?? null;
// sotaBySpec[id]: { score, eligible } | 'unclaimed' | undefined (API was down)
const sotaData = sotaBySpec[r.spec_id];
const sotaKnown = sotaData !== undefined;
const dir = r.direction;
const icon = res.passed ? '\u2705' : '\u274c';

Expand All @@ -198,17 +228,24 @@ jobs:
const vsBaseline = (pct >= 0 ? '+' : '') + pct.toFixed(1) + '%';

let sotaCell = '\u2014';
if (sota === null) {
// Unclaimed spec — any passing submission sets new SOTA
if (!apiReachable || !sotaKnown) {
// API was down — can't determine SOTA status; don't guess
sotaCell = '\u2014 (API unreachable)';
} else if (sotaData === 'unclaimed') {
// No existing SOTA — any passing submission sets it
anyBeatsSota = true;
sotaCell = '\u2b50 unclaimed \u2014 sets new SOTA';
} else {
const beats = dir === 'maximize' ? score > sota : score < sota;
} else if (sotaData && sotaData.eligible) {
// Beats current SOTA with sufficient margin (marginal-gain rule satisfied)
anyBeatsSota = true;
sotaCell = `\u2713 beats ${fmt(sotaData.score, r.metric)} ${unit} (margin ok)`;
} else if (sotaData && sotaData.score !== null) {
const beats = dir === 'maximize' ? score > sotaData.score : score < sotaData.score;
if (beats) {
anyBeatsSota = true;
sotaCell = `\u2713 beats ${fmt(sota, r.metric)} ${unit}`;
// Beats raw score but margin threshold not met
sotaCell = `\u26a0\ufe0f beats ${fmt(sotaData.score, r.metric)} ${unit} — margin too small`;
} else {
sotaCell = `${fmt(sota, r.metric)} ${unit}`;
sotaCell = `${fmt(sotaData.score, r.metric)} ${unit}`;
}
}

Expand Down
17 changes: 5 additions & 12 deletions QUICKSTART.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,8 @@ curl http://143.244.191.193:8000/specs/r01_001_easy
forge new <your-name>
```

Edit `agents/<your-name>/agent.py`. Two supported signatures:
Edit `agents/<your-name>/agent.py`. The required signature is:

**Static agent** (no LLM):
```python
def generate(spec: dict) -> bytes:
"""Takes the spec dict, returns STEP file bytes."""
...
```

**LLM agent** (recommended — harness injects the client):
```python
from forge.sdk.llm import LLMClient

Expand All @@ -102,13 +94,14 @@ def generate(spec: dict, llm: LLMClient) -> bytes:
...
```

No API key needed — the harness injects `LLMClient` automatically using whitelisted models. See `examples/metric-aware-agent/agent.py` for a recommended starting point that adapts strategy to all three competition categories.
The harness injects `LLMClient` automatically — no API key needed. Agents that don't accept the `llm` parameter are rejected at eval time.

Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`.

Reference implementations:
- `agents/baseline/` — solid bracket baseline; sets the upper-bound score every submission must beat
- `examples/metric-aware-agent/` — adapts geometry to mass / stiffness / deflection objectives
- `examples/metric-aware-agent/` — adapts geometry to mass / stiffness / deflection objectives (recommended starting point)
- `examples/llm-agent/` — minimal LLM integration example
- `examples/deterministic-agent/` — pure geometry math, no LLM; shows algorithms are welcome

---

Expand Down
12 changes: 2 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,8 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for full guidelines.

## Agent interface

Two supported signatures — the harness detects which one you use automatically:
Your `agent.py` must export a single function:

**Static agent** (no LLM):
```python
def generate(spec: dict) -> bytes:
"""Build and return STEP file bytes for the given spec."""
...
```

**LLM agent** (recommended):
```python
from forge.sdk.llm import LLMClient

Expand All @@ -122,7 +114,7 @@ def generate(spec: dict, llm: LLMClient) -> bytes:
...
```

The harness injects `LLMClient` automatically — no API key required. Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`.
The harness injects `LLMClient` automatically — no API key required. Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`. Agents without the `llm` parameter are rejected at eval time.

Three example agents in `examples/`:

Expand Down
8 changes: 7 additions & 1 deletion agents/baseline/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,21 @@
Score: ~165g. Miners beat this by removing material where stress is low.
The bracket has a vertical mounting plate (bolt holes), a horizontal shelf
(reaches the load point), and no topology optimization whatsoever.

This agent accepts the LLMClient parameter but does not use it — it is
purely deterministic geometry. This is permitted; the harness requires
the parameter to be present in the signature, not necessarily used.
"""

from __future__ import annotations

import os
import tempfile

from forge.sdk.llm import LLMClient


def generate(spec: dict) -> bytes:
def generate(spec: dict, llm: LLMClient) -> bytes: # noqa: ARG001
"""Build a parametric L-bracket and return STEP bytes."""
from OCP.BRepAlgoAPI import BRepAlgoAPI_Cut, BRepAlgoAPI_Fuse
from OCP.BRepPrimAPI import BRepPrimAPI_MakeBox, BRepPrimAPI_MakeCylinder
Expand Down
30 changes: 19 additions & 11 deletions agents/template/agent.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,32 @@
"""
Template agent — start here.

Two supported signatures:
Required signature:

generate(spec: dict) -> bytes # static agent
generate(spec: dict, llm: LLMClient) -> bytes # LLM agent (recommended)
generate(spec: dict, llm: LLMClient) -> bytes

The harness detects which you use via inspect.signature and injects LLMClient
automatically if present — no API key required from you.
The harness injects LLMClient automatically — no API key required.
Agents without the `llm` parameter are rejected at eval time.

See QUICKSTART.md for a full walkthrough. For a recommended starting point
that adapts to all three competition categories, see examples/metric-aware-agent/.
"""

from __future__ import annotations

# To use the LLM client, uncomment:
# from forge.sdk.llm import LLMClient
from forge.sdk.llm import LLMClient

# TODO: import your geometry library
# from build123d import ... # recommended
# from OCP.BRepPrimAPI import ... # raw OCP (see agents/baseline/)


def generate(spec: dict) -> bytes:
def generate(spec: dict, llm: LLMClient) -> bytes:
"""
Build and return a STEP file for the given spec.

To use an LLM, change the signature to: generate(spec, llm: LLMClient)
Use `llm.chat(messages)` to call the whitelisted LLM
(claude-haiku-4-5, claude-3-5-haiku, or gpt-4o-mini).

Args:
spec: Problem specification dict. Key fields:
Expand All @@ -51,6 +50,15 @@ def generate(spec: dict) -> bytes:
"""

constraints = spec["constraints"]
metric = spec["scoring"]["metric"] # "mass_grams" | "stiffness_to_weight" | "deflection_mm"

# TODO: use the LLM to reason about geometry parameters
# response = llm.chat([
# {"role": "system", "content": "You are a structural engineering assistant."},
# {"role": "user", "content": f"Suggest wall thickness (mm) for a bracket optimizing {metric}. "
# f"Load: {constraints['load_newtons']} N. Reply with a single number."},
# ])
# thickness_mm = float(response.strip())

# TODO: read the constraints you need
# load_n = constraints["load_newtons"]
Expand All @@ -59,14 +67,14 @@ def generate(spec: dict) -> bytes:
# bolt_pattern = constraints["bolt_pattern_mm"] # [[y, z], ...]
# bolt_d = constraints["bolt_diameter_clearance_mm"]

# TODO: build your geometry
# TODO: build your geometry using the parameters above
# shape = ...

# TODO: write to STEP and return bytes
# return _to_step_bytes(shape)

raise NotImplementedError(
"Replace this with your geometry. See QUICKSTART.md for examples."
"Replace this with your geometry. See examples/metric-aware-agent/ for a working example."
)


Expand Down
16 changes: 15 additions & 1 deletion docs/threat-model.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ This document describes the attack surface of the Forge benchmark and the mitiga
- FEA gate: CalculiX linear statics must pass; max stress ≤ allowable stress.
- Geometry gate: `min_wall_thickness` constraint enforced via Shapely cross-section sampling.
- Build volume, bolt pattern, overhang angle checked by `benchmark/geometry.py`.
- 3× determinism check: eval runs three times with the same seed; non-deterministic outputs are rejected.
- 2× determinism check on the first spec: CI runs spec 0 twice; if the score differs, the submission is flagged non-deterministic and rejected. Remaining specs run once to keep CI time manageable. (Note: stochastic agents that vary only on later specs could slip through — improving this is a known gap.)

**Residual risk:** Low-medium. The wall-thickness sampler uses a finite grid; pathological thin bridges between grid sample points could still slip through.

Expand Down Expand Up @@ -144,6 +144,18 @@ This document describes the attack surface of the Forge benchmark and the mitiga

---

## Threat 9 — Specialist gaming (leaderboard breadth gap)

**Attack:** Enter only 3 easy specs (one per round) and achieve #1 on all three. Under an avg_rank model, this yields avg_rank = 1.0 — ranking above a well-rounded agent that competes on all 45 specs but averages rank 1.5.

**Mitigations:**
- **Implemented:** Overall leaderboard now sorts by `overall_score` — mean normalized performance across ALL active specs (lower is better). Unentered specs count as 1.0 (baseline) in the mean. A miner who enters only 3 of 45 specs can lower their overall_score by at most 3/45 of their average per-spec gain, so their score stays close to 1.0 and they rank below an agent that beats baseline by a comparable margin across the full problem pool.
- This directly rewards breadth: entering more specs and beating baseline on each one lowers your overall_score.

**Residual risk:** Low. A miner who is genuinely #1 on every spec they enter still benefits from entering more specs — better coverage further reduces their overall_score.

---

## Summary table

| Threat | Severity | Status |
Expand All @@ -156,3 +168,5 @@ This document describes the attack surface of the Forge benchmark and the mitiga
| Sybil submissions | Low | Mitigated (credibility + min PR requirement) |
| Eval overfitting | Medium | Mitigated (hidden spec set + rotation) |
| Load-case overfitting | Low | Mitigated (seeded load perturbation in FEA) |
| Specialist gaming | Medium | Mitigated (breadth-normalized overall_score) |
| Determinism check coverage | Low | Partial — only spec 0 runs 2× (gap: stochastic agents may vary on later specs) |
20 changes: 15 additions & 5 deletions scripts/record_submissions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import json
import os
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path
Expand Down Expand Up @@ -63,8 +64,17 @@
req = urllib.request.Request(
url, data=body, headers={"Content-Type": "application/json"}, method="POST"
)
try:
with urllib.request.urlopen(req, timeout=15) as resp:
print(f"[{spec_id}] recorded: {resp.read().decode()[:80]}")
except urllib.error.URLError as e:
print(f"[{spec_id}] forge-api POST failed (non-blocking): {e}", flush=True)
# Retry up to 3 times with exponential backoff (1s, then 2s, between attempts) so
# transient API hiccups don't silently drop leaderboard submissions.
last_exc: Exception | None = None
for attempt in range(3):
try:
with urllib.request.urlopen(req, timeout=15) as resp:
print(f"[{spec_id}] recorded: {resp.read().decode()[:80]}")
break
except urllib.error.URLError as e:
last_exc = e
if attempt < 2:
time.sleep(2 ** attempt)
else:
print(f"[{spec_id}] forge-api POST failed after 3 attempts (non-blocking): {last_exc}", flush=True)
22 changes: 18 additions & 4 deletions scripts/run_hidden_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import subprocess
import sys
import tempfile
import time
import urllib.request
import urllib.error

API_URL = os.environ.get("FORGE_API_URL", "").rstrip("/")
ADMIN_KEY = os.environ.get("FORGE_ADMIN_KEY", "")
Expand All @@ -48,11 +50,24 @@
workspace = os.getcwd()


def _api_request(req: urllib.request.Request, label: str) -> dict:
    """Execute an API request, retrying transient failures with backoff.

    Makes up to three attempts, sleeping 1s after the first failure and 2s
    after the second; no sleep follows the final attempt.

    HTTP 4xx responses other than 408 (Request Timeout) and 429 (Too Many
    Requests) are treated as permanent and are not retried: repeating the
    same request cannot fix a client error, and re-sending a rejected POST
    only adds load.

    Args:
        req: Fully prepared request (URL, headers, method, body).
        label: Short description of the call, used to prefix the error
            message.

    Returns:
        The response body parsed as JSON.

    Raises:
        RuntimeError: If every attempt failed, or a non-retryable HTTP
            error was returned. The underlying exception is chained as
            ``__cause__``.
    """
    max_attempts = 3
    attempts_made = 0
    last_exc: Exception | None = None
    for attempt in range(max_attempts):
        attempts_made = attempt + 1
        try:
            with urllib.request.urlopen(req, timeout=15) as resp:
                return json.loads(resp.read())
        except urllib.error.HTTPError as e:
            last_exc = e
            # Client errors are permanent, except timeout / rate-limit codes.
            if 400 <= e.code < 500 and e.code not in (408, 429):
                break
        except OSError as e:
            # URLError is an OSError subclass; catching OSError also covers
            # socket timeouts and connection resets raised while reading the
            # response body, which urllib does not wrap in URLError.
            last_exc = e
        if attempt < max_attempts - 1:
            time.sleep(2 ** attempt)
    plural = "" if attempts_made == 1 else "s"
    raise RuntimeError(
        f"[{label}] forge-api request failed after {attempts_made} attempt{plural}: {last_exc}"
    ) from last_exc


def _api_get(path: str) -> dict:
url = f"{API_URL}{path}"
req = urllib.request.Request(url, headers={"Authorization": f"Bearer {ADMIN_KEY}"})
with urllib.request.urlopen(req, timeout=15) as resp:
return json.loads(resp.read())
return _api_request(req, f"GET {path}")


def _api_post(path: str, payload: dict) -> dict:
Expand All @@ -63,8 +78,7 @@ def _api_post(path: str, payload: dict) -> dict:
headers={"Authorization": f"Bearer {ADMIN_KEY}", "Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=15) as resp:
return json.loads(resp.read())
return _api_request(req, f"POST {path}")


def _run_eval(spec: dict) -> dict:
Expand Down
Loading