From 29aab0e189ad075f456def8851cc45d99fe78f87 Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 13:14:16 +0000 Subject: [PATCH 1/6] Clamp baseline geometry to build volume, cover load point Z --- agents/baseline/agent.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/agents/baseline/agent.py b/agents/baseline/agent.py index c7e0cc9..9c5a396 100644 --- a/agents/baseline/agent.py +++ b/agents/baseline/agent.py @@ -29,15 +29,19 @@ def generate(spec: dict, llm: LLMClient) -> bytes: # noqa: ARG001 constraints = spec["constraints"] bolt_pattern = constraints["bolt_pattern_mm"] bolt_d = constraints["bolt_diameter_clearance_mm"] + bv = constraints["build_volume_mm"] # [x, y, z] limits + load_pt = constraints["load_point_mm"] by_coords = [p[0] for p in bolt_pattern] bz_coords = [p[1] for p in bolt_pattern] - plate_y = max(by_coords) + 20.0 - plate_z = max(bz_coords) + 20.0 + # Clamp to build volume so the bracket always fits + plate_y = min(max(by_coords) + 20.0, bv[1]) + plate_z = min(max(bz_coords) + 20.0, bv[2]) plate_thickness = 10.0 - shelf_length = constraints["load_point_mm"][0] + 15.0 - shelf_thickness = 12.0 + # Shelf must reach the load point in X and cover its Z coordinate + shelf_length = min(load_pt[0] + 15.0, bv[0]) + shelf_thickness = min(max(15.0, load_pt[2] + 10.0), bv[2]) shelf_z = plate_z # Mounting plate From df509dbe0ddaec0e97d8ccb65e124291298d09a1 Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 13:19:35 +0000 Subject: [PATCH 2/6] Leave 2mm margin from build volume boundary in baseline --- agents/baseline/agent.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/agents/baseline/agent.py b/agents/baseline/agent.py index 9c5a396..687c7b2 100644 --- a/agents/baseline/agent.py +++ b/agents/baseline/agent.py @@ -34,14 +34,14 @@ def generate(spec: dict, llm: LLMClient) -> bytes: # noqa: ARG001 by_coords = [p[0] for p in bolt_pattern] bz_coords = [p[1] for p in bolt_pattern] - # Clamp to build volume so the bracket always fits - plate_y = min(max(by_coords) + 20.0, bv[1]) - plate_z = min(max(bz_coords) + 20.0, bv[2]) + # Clamp to build volume — leave 2mm margin to avoid floating-point boundary failures + plate_y = min(max(by_coords) + 20.0, bv[1] - 2.0) + plate_z = min(max(bz_coords) + 20.0, bv[2] - 2.0) plate_thickness = 10.0 # Shelf must reach the load point in X and cover its Z coordinate - shelf_length = min(load_pt[0] + 15.0, bv[0]) - shelf_thickness = min(max(15.0, load_pt[2] + 10.0), bv[2]) + shelf_length = min(load_pt[0] + 15.0, bv[0] - 2.0) + shelf_thickness = min(max(15.0, load_pt[2] + 10.0), bv[2] - 2.0) shelf_z = plate_z # Mounting plate From 78b6193996b510d2145a887ddeb5be7b06781d95 Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 13:24:09 +0000 Subject: [PATCH 3/6] Fix model IDs in CONTRIBUTING, add llm-agent to examples list --- CONTRIBUTING.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 07b588d..dfc6b96 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -28,10 +28,11 @@ def generate(spec: dict, llm: LLMClient) -> bytes: ... ``` -The harness injects `LLMClient` — no API key required. Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`. Both parameters are required; agents that omit `llm` are rejected at eval time. +The harness injects `LLMClient` — no API key required. Whitelisted models: `anthropic/claude-haiku-4-5`, `anthropic/claude-3-5-haiku`, `openai/gpt-4o-mini`. Both parameters are required; agents that omit `llm` are rejected at eval time. Starting point in `examples/`: -- `metric-aware-agent/` — adapts strategy per scoring metric, call `llm.chat()` to reason about geometry +- `metric-aware-agent/` — adapts strategy per scoring metric; uses `llm.chat()` to reason about geometry +- `llm-agent/` — minimal single-call LLM agent, good for getting started The agent runs inside a Docker container: - **Time:** 60 seconds | **Memory:** 4 GB | **Network:** enabled (LLM calls only) From ca1a7c69a63112626ed4e6ee69e8368682b3e731 Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 13:25:21 +0000 Subject: [PATCH 4/6] Serialize score.yml to prevent concurrent DB-write overload --- .github/workflows/score.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/score.yml b/.github/workflows/score.yml index 7e78bfa..14c5799 100644 --- a/.github/workflows/score.yml +++ b/.github/workflows/score.yml @@ -12,6 +12,12 @@ on: env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true +# Serialize full-score jobs to avoid concurrent DB writes overwhelming the API. +# Pending runs wait; they do NOT cancel each other (each merged agent must be scored). +concurrency: + group: score-main + cancel-in-progress: false + jobs: find-agent: runs-on: ubuntu-latest From 97b274c1863c140a85e7bd433326c81120d1459e Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 13:26:54 +0000 Subject: [PATCH 5/6] Add session-2 changelog entries --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5090d60..46824e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ Format: newest entries first. ## [Unreleased] +## 2026-06-03 (session 2) + +### Fixed +- **Baseline agent build-volume overflow** (PR #214, `agents/baseline/agent.py`): naive L-bracket used `max_bolt_coord + 20mm` for plate dimensions, exceeding build volume on some specs; also `shelf_thickness=12mm` fixed regardless of spec load-point Z, causing "too few nodes near load point" FEA failure. Fixed: all dimensions clamped to build volume with 2mm margin; shelf height computed from `load_point_mm[2]`. +- **CONTRIBUTING model IDs** (PR #215): model IDs were shown without provider prefix (`claude-haiku-4-5` vs `anthropic/claude-haiku-4-5`); corrected to match `FORGE_MODEL_WHITELIST` format. Added `llm-agent/` to examples list. + +### Changed +- **score.yml serialization** (PR #216, `.github/workflows/score.yml`): add `concurrency: group: score-main, cancel-in-progress: false` to serialize post-merge full scoring runs. Prevents concurrent SQLite writes overwhelming forge-api at high miner volume. + ## 2026-06-03 ### Added From ed1206ebe3f671a5e2f4c0498de39e22b6bd75f5 Mon Sep 17 00:00:00 2001 From: Punch Date: Wed, 3 Jun 2026 13:42:45 +0000 Subject: [PATCH 6/6] Harden CI eval: crash detection, empty STEP guard, extended timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent fixes identified in scale readiness audit: 1. run_eval_pool.py: distinguish container crash (returncode != 0, no output) from bad JSON (container ran but output is garbage). Previously both showed "Invalid JSON output" — crash now shows "Container exited 137" with stderr tail, making OOM kills and segfaults debuggable by miners. 2. record_submissions.py: skip STEP files smaller than 200 bytes. The file is pre-created as 0 bytes before docker run so the container can write to it; if the container crashes mid-run the file stays empty. Storing an empty BLOB sets has_step=true for a submission with no geometry, breaking the 3D viewer for that entry. 3. score.yml: increase score-round timeout-minutes from 90 → 150. 15 specs × ~180s each + Docker overhead ≈ 50 min per round; 90 min was dangerously close to the limit for slower specs under high load. eval.yml and hidden-eval remain at 90 min (3 specs each — sufficient). Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/score.yml | 2 +- scripts/record_submissions.py | 5 ++++- scripts/run_eval_pool.py | 20 ++++++++++++++------ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/.github/workflows/score.yml b/.github/workflows/score.yml index 14c5799..45d9777 100644 --- a/.github/workflows/score.yml +++ b/.github/workflows/score.yml @@ -58,7 +58,7 @@ jobs: needs: find-agent if: needs.find-agent.outputs.found == 'true' runs-on: ubuntu-latest - timeout-minutes: 90 + timeout-minutes: 150 strategy: fail-fast: false matrix: diff --git a/scripts/record_submissions.py b/scripts/record_submissions.py index 61e935c..f542f53 100644 --- a/scripts/record_submissions.py +++ b/scripts/record_submissions.py @@ -35,7 +35,10 @@ step_b64 = None step_file = Path(f".forge_step_{spec_id}.step") if step_file.exists(): - step_b64 = base64.b64encode(step_file.read_bytes()).decode() + # Only include non-empty STEP files; a 0-byte file means the container + # crashed before writing geometry. 200 bytes is well below any valid STEP header. + if step_file.stat().st_size > 200: + step_b64 = base64.b64encode(step_file.read_bytes()).decode() step_file.unlink(missing_ok=True) score_metric = result.get("score_metric", entry.get("metric", "mass_grams")) diff --git a/scripts/run_eval_pool.py b/scripts/run_eval_pool.py index 7b699d7..641f16e 100644 --- a/scripts/run_eval_pool.py +++ b/scripts/run_eval_pool.py @@ -72,16 +72,24 @@ if proc.stderr.strip(): print(f"[{spec_id}] stderr:\n{proc.stderr.strip()}", flush=True) - try: - result_data = json.loads(out) - except (json.JSONDecodeError, ValueError): - # Surface the tail of stderr so miners can debug crashes. - stderr_tail = proc.stderr.strip()[-400:] if proc.stderr.strip() else "" + stderr_tail = proc.stderr.strip()[-400:] if proc.stderr.strip() else "" + if proc.returncode != 0 and not out: + # Container crashed before producing any output (OOM, segfault, etc.) hint = f" | stderr: {stderr_tail}" if stderr_tail else "" result_data = { "passed": False, - "reason": f"Invalid JSON output: {out[:120]}{hint}", + "reason": f"Container exited {proc.returncode}{hint}", } + else: + try: + result_data = json.loads(out) + except (json.JSONDecodeError, ValueError): + # Surface the tail of stderr so miners can debug crashes. + hint = f" | stderr: {stderr_tail}" if stderr_tail else "" + result_data = { + "passed": False, + "reason": f"Invalid JSON output: {out[:120]}{hint}", + } # Check determinism on first spec if prev_score is not None: