diff --git a/.github/workflows/score.yml b/.github/workflows/score.yml
index 7e78bfa..45d9777 100644
--- a/.github/workflows/score.yml
+++ b/.github/workflows/score.yml
@@ -12,6 +12,12 @@ on:
 env:
   FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
 
+# Serialize full-score jobs to avoid concurrent DB writes overwhelming the API.
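+# In-progress runs are never cancelled, but GitHub keeps only ONE pending run per group (a newer queued run replaces it), so a burst of merges can still drop a run even though each merged agent must be scored.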
+concurrency:
+  group: score-main
+  cancel-in-progress: false
+
 jobs:
   find-agent:
     runs-on: ubuntu-latest
@@ -52,7 +58,7 @@ jobs:
     needs: find-agent
     if: needs.find-agent.outputs.found == 'true'
     runs-on: ubuntu-latest
-    timeout-minutes: 90
+    timeout-minutes: 150
     strategy:
       fail-fast: false
       matrix:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5090d60..46824e2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,15 @@ Format: newest entries first.
 
 ## [Unreleased]
 
+## 2026-06-03 (session 2)
+
+### Fixed
+- **Baseline agent build-volume overflow** (PR #214, `agents/baseline/agent.py`): the naive L-bracket used `max_bolt_coord + 20mm` for plate dimensions, which exceeded the build volume on some specs; in addition, `shelf_thickness` was fixed at 12mm regardless of the spec's load-point Z, causing the "too few nodes near load point" FEA failure. Fixed: all dimensions are clamped to the build volume with a 2mm margin, and shelf thickness is computed from `load_point_mm[2]`.
+- **CONTRIBUTING model IDs** (PR #215): model IDs were shown without provider prefix (`claude-haiku-4-5` vs `anthropic/claude-haiku-4-5`); corrected to match `FORGE_MODEL_WHITELIST` format. Added `llm-agent/` to examples list.
+
+### Changed
+- **score.yml serialization** (PR #216, `.github/workflows/score.yml`): add `concurrency: group: score-main, cancel-in-progress: false` to serialize post-merge full scoring runs. Prevents concurrent SQLite writes overwhelming forge-api at high miner volume.
+
 ## 2026-06-03
 
 ### Added
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 07b588d..dfc6b96 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -28,10 +28,11 @@ def generate(spec: dict, llm: LLMClient) -> bytes:
     ...
 ```
 
-The harness injects `LLMClient` — no API key required. Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`. Both parameters are required; agents that omit `llm` are rejected at eval time.
+The harness injects `LLMClient` — no API key required. Whitelisted models: `anthropic/claude-haiku-4-5`, `anthropic/claude-3-5-haiku`, `openai/gpt-4o-mini`. Both parameters are required; agents that omit `llm` are rejected at eval time.
 
 Starting point in `examples/`:
-- `metric-aware-agent/` — adapts strategy per scoring metric, call `llm.chat()` to reason about geometry
+- `metric-aware-agent/` — adapts strategy per scoring metric; uses `llm.chat()` to reason about geometry
+- `llm-agent/` — minimal single-call LLM agent, good for getting started
 
 The agent runs inside a Docker container:
 - **Time:** 60 seconds | **Memory:** 4 GB | **Network:** enabled (LLM calls only)
diff --git a/agents/baseline/agent.py b/agents/baseline/agent.py
index c7e0cc9..687c7b2 100644
--- a/agents/baseline/agent.py
+++ b/agents/baseline/agent.py
@@ -29,15 +29,19 @@ def generate(spec: dict, llm: LLMClient) -> bytes:  # noqa: ARG001
     constraints = spec["constraints"]
     bolt_pattern = constraints["bolt_pattern_mm"]
     bolt_d = constraints["bolt_diameter_clearance_mm"]
+    bv = constraints["build_volume_mm"]  # [x, y, z] limits
+    load_pt = constraints["load_point_mm"]
 
     by_coords = [p[0] for p in bolt_pattern]
     bz_coords = [p[1] for p in bolt_pattern]
-    plate_y = max(by_coords) + 20.0
-    plate_z = max(bz_coords) + 20.0
+    # Clamp to build volume — leave 2mm margin to avoid floating-point boundary failures
+    plate_y = min(max(by_coords) + 20.0, bv[1] - 2.0)
+    plate_z = min(max(bz_coords) + 20.0, bv[2] - 2.0)
     plate_thickness = 10.0
 
-    shelf_length = constraints["load_point_mm"][0] + 15.0
-    shelf_thickness = 12.0
+    # Shelf must reach the load point in X and cover its Z coordinate
+    shelf_length = min(load_pt[0] + 15.0, bv[0] - 2.0)
+    shelf_thickness = min(max(15.0, load_pt[2] + 10.0), bv[2] - 2.0)
     shelf_z = plate_z
 
     # Mounting plate
diff --git a/scripts/record_submissions.py b/scripts/record_submissions.py
index 61e935c..f542f53 100644
--- a/scripts/record_submissions.py
+++ b/scripts/record_submissions.py
@@ -35,7 +35,10 @@
     step_b64 = None
     step_file = Path(f".forge_step_{spec_id}.step")
     if step_file.exists():
-        step_b64 = base64.b64encode(step_file.read_bytes()).decode()
+        # Only include STEP files larger than 200 bytes; a 0-byte file means the container
+        # crashed before writing geometry. 200 bytes is well below any valid STEP header.
+        if step_file.stat().st_size > 200:
+            step_b64 = base64.b64encode(step_file.read_bytes()).decode()
         step_file.unlink(missing_ok=True)
 
     score_metric = result.get("score_metric", entry.get("metric", "mass_grams"))
diff --git a/scripts/run_eval_pool.py b/scripts/run_eval_pool.py
index 7b699d7..641f16e 100644
--- a/scripts/run_eval_pool.py
+++ b/scripts/run_eval_pool.py
@@ -72,16 +72,24 @@
         if proc.stderr.strip():
             print(f"[{spec_id}] stderr:\n{proc.stderr.strip()}", flush=True)
 
-        try:
-            result_data = json.loads(out)
-        except (json.JSONDecodeError, ValueError):
-            # Surface the tail of stderr so miners can debug crashes.
-            stderr_tail = proc.stderr.strip()[-400:] if proc.stderr.strip() else ""
+        stderr_tail = proc.stderr.strip()[-400:] if proc.stderr.strip() else ""
+        if proc.returncode != 0 and not out:
+            # Container crashed before producing any output (OOM, segfault, etc.)
             hint = f" | stderr: {stderr_tail}" if stderr_tail else ""
             result_data = {
                 "passed": False,
-                "reason": f"Invalid JSON output: {out[:120]}{hint}",
+                "reason": f"Container exited {proc.returncode}{hint}",
             }
+        else:
+            try:
+                result_data = json.loads(out)
+            except (json.JSONDecodeError, ValueError):
+                # Surface the tail of stderr so miners can debug crashes.
+                hint = f" | stderr: {stderr_tail}" if stderr_tail else ""
+                result_data = {
+                    "passed": False,
+                    "reason": f"Invalid JSON output: {out[:120]}{hint}",
+                }
 
         # Check determinism on first spec
         if prev_score is not None: