PunchTheDev · PunchTheDev · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml
@@ -30,16 +30,30 @@ jobs:
       - name: Find changed agent
         id: agent
         run: |
-          AGENT=$(git diff --name-only origin/${{ github.base_ref }}...HEAD \
-            | grep '^agents/.*/agent\.py$' | grep -v '^agents/template/' | head -1)
-          if [ -z "$AGENT" ]; then
+          CHANGED=$(git diff --name-only origin/${{ github.base_ref }}...HEAD \
+            | grep '^agents/.*/agent\.py$' | grep -v '^agents/template/' || true)  # grep exits 1 on no match; don't abort under bash -e
+
+          if [ -z "$CHANGED" ]; then
             echo "No agent.py changed — skipping eval."
             echo "found=false" >> "$GITHUB_OUTPUT"
-          else
-            echo "path=$AGENT" >> "$GITHUB_OUTPUT"
-            echo "found=true" >> "$GITHUB_OUTPUT"
+            exit 0
           fi
 
+          # Block PRs that touch multiple agent directories — prevents one miner
+          # from silently overwriting another miner's agent on merge.
+          DIRS=$(echo "$CHANGED" | sed 's|/agent\.py||' | sort -u)
+          DIR_COUNT=$(echo "$DIRS" | wc -l)
+          if [ "$DIR_COUNT" -gt "1" ]; then
+            echo "ERROR: PR modifies agent.py in multiple directories:" >&2
+            echo "$DIRS" >&2
+            echo "Each PR must touch exactly one agent directory." >&2
+            exit 1
+          fi
+
+          AGENT=$(echo "$CHANGED" | head -1)
+          echo "path=$AGENT" >> "$GITHUB_OUTPUT"
+          echo "found=true" >> "$GITHUB_OUTPUT"
+
       - name: Check source similarity against reference agents
         id: similarity
         if: steps.agent.outputs.found == 'true'

diff --git a/.github/workflows/score.yml b/.github/workflows/score.yml
@@ -12,6 +12,12 @@ on:
 env:
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
 
+# Serialize full-score jobs to avoid concurrent DB writes overwhelming the API.
+# In-progress runs are never cancelled, but GitHub keeps only ONE pending run per group (a newer queued run cancels an older pending one), so rapid merges may skip scoring.
+concurrency:
+  group: score-main
+  cancel-in-progress: false
+
 jobs:
   find-agent:
     runs-on: ubuntu-latest
@@ -52,7 +58,7 @@ jobs:
     needs: find-agent
     if: needs.find-agent.outputs.found == 'true'
     runs-on: ubuntu-latest
-    timeout-minutes: 90
+    timeout-minutes: 150
     strategy:
       fail-fast: false
       matrix:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,15 @@ Format: newest entries first.
 
 ## [Unreleased]
 
+## 2026-06-03 (session 2)
+
+### Fixed
+- **Baseline agent build-volume overflow** (PR #214, `agents/baseline/agent.py`): the naive L-bracket used `max_bolt_coord + 20mm` for plate dimensions, exceeding the build volume on some specs; in addition, `shelf_thickness` was hard-coded to 12mm regardless of the spec's load-point Z, causing the "too few nodes near load point" FEA failure. Fixed: all dimensions are clamped to the build volume with a 2mm margin; shelf height is computed from `load_point_mm[2]`.
+- **CONTRIBUTING model IDs** (PR #215): model IDs were shown without provider prefix (`claude-haiku-4-5` vs `anthropic/claude-haiku-4-5`); corrected to match `FORGE_MODEL_WHITELIST` format. Added `llm-agent/` to examples list.
+
+### Changed
+- **score.yml serialization** (PR #216, `.github/workflows/score.yml`): add `concurrency: group: score-main, cancel-in-progress: false` to serialize post-merge full scoring runs. Prevents concurrent SQLite writes from overwhelming forge-api at high miner volume.
+
 ## 2026-06-03
 
 ### Added

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -28,10 +28,11 @@ def generate(spec: dict, llm: LLMClient) -> bytes:
     ...
 ```
 
-The harness injects `LLMClient` — no API key required. Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`. Both parameters are required; agents that omit `llm` are rejected at eval time.
+The harness injects `LLMClient` — no API key required. Whitelisted models: `anthropic/claude-haiku-4-5`, `anthropic/claude-3-5-haiku`, `openai/gpt-4o-mini`. Both parameters are required; agents that omit `llm` are rejected at eval time.
 
 Starting point in `examples/`:
-- `metric-aware-agent/` — adapts strategy per scoring metric, call `llm.chat()` to reason about geometry
+- `metric-aware-agent/` — adapts strategy per scoring metric; uses `llm.chat()` to reason about geometry
+- `llm-agent/` — minimal single-call LLM agent, good for getting started
 
 The agent runs inside a Docker container:
 - **Time:** 60 seconds | **Memory:** 4 GB | **Network:** enabled (LLM calls only)

diff --git a/agents/baseline/agent.py b/agents/baseline/agent.py
@@ -29,15 +29,19 @@ def generate(spec: dict, llm: LLMClient) -> bytes:  # noqa: ARG001
     constraints = spec["constraints"]
     bolt_pattern = constraints["bolt_pattern_mm"]
     bolt_d = constraints["bolt_diameter_clearance_mm"]
+    bv = constraints["build_volume_mm"]  # [x, y, z] limits
+    load_pt = constraints["load_point_mm"]
 
     by_coords = [p[0] for p in bolt_pattern]
     bz_coords = [p[1] for p in bolt_pattern]
-    plate_y = max(by_coords) + 20.0
-    plate_z = max(bz_coords) + 20.0
+    # Clamp to build volume — leave 2mm margin to avoid floating-point boundary failures
+    plate_y = min(max(by_coords) + 20.0, bv[1] - 2.0)
+    plate_z = min(max(bz_coords) + 20.0, bv[2] - 2.0)
     plate_thickness = 10.0
 
-    shelf_length = constraints["load_point_mm"][0] + 15.0
-    shelf_thickness = 12.0
+    # Shelf must reach the load point in X and cover its Z coordinate
+    shelf_length = min(load_pt[0] + 15.0, bv[0] - 2.0)
+    shelf_thickness = min(max(15.0, load_pt[2] + 10.0), bv[2] - 2.0)
     shelf_z = plate_z
 
     # Mounting plate

diff --git a/scripts/record_submissions.py b/scripts/record_submissions.py
@@ -35,7 +35,10 @@
     step_b64 = None
     step_file = Path(f".forge_step_{spec_id}.step")
     if step_file.exists():
-        step_b64 = base64.b64encode(step_file.read_bytes()).decode()
+        # Only include STEP files larger than 200 bytes; an empty (0-byte) file means the container
+        # crashed before writing geometry. 200 bytes is well below any valid STEP header.
+        if step_file.stat().st_size > 200:
+            step_b64 = base64.b64encode(step_file.read_bytes()).decode()
         step_file.unlink(missing_ok=True)
 
     score_metric = result.get("score_metric", entry.get("metric", "mass_grams"))

diff --git a/scripts/run_eval_pool.py b/scripts/run_eval_pool.py
@@ -72,16 +72,24 @@
         if proc.stderr.strip():
             print(f"[{spec_id}] stderr:\n{proc.stderr.strip()}", flush=True)
 
-        try:
-            result_data = json.loads(out)
-        except (json.JSONDecodeError, ValueError):
-            # Surface the tail of stderr so miners can debug crashes.
-            stderr_tail = proc.stderr.strip()[-400:] if proc.stderr.strip() else ""
+        stderr_tail = proc.stderr.strip()[-400:] if proc.stderr.strip() else ""
+        if proc.returncode != 0 and not out:
+            # Container crashed before producing any output (OOM, segfault, etc.)
             hint = f" | stderr: {stderr_tail}" if stderr_tail else ""
             result_data = {
                 "passed": False,
-                "reason": f"Invalid JSON output: {out[:120]}{hint}",
+                "reason": f"Container exited {proc.returncode}{hint}",
             }
+        else:
+            try:
+                result_data = json.loads(out)
+            except (json.JSONDecodeError, ValueError):
+                # Surface the tail of stderr so miners can debug crashes.
+                hint = f" | stderr: {stderr_tail}" if stderr_tail else ""
+                result_data = {
+                    "passed": False,
+                    "reason": f"Invalid JSON output: {out[:120]}{hint}",
+                }
 
         # Check determinism on first spec
         if prev_score is not None: