diff --git a/.github/workflows/score.yml b/.github/workflows/score.yml index 7e78bfa..45d9777 100644 --- a/.github/workflows/score.yml +++ b/.github/workflows/score.yml @@ -12,6 +12,12 @@ on: env: FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true +# Serialize full-score runs to avoid concurrent DB writes overwhelming the API. +# cancel-in-progress: false protects the in-progress run, but GitHub keeps only ONE pending run per group: a newer queued run replaces an older pending one, so rapid merges can skip scoring. +concurrency: + group: score-main + cancel-in-progress: false + jobs: find-agent: runs-on: ubuntu-latest @@ -52,7 +58,7 @@ jobs: needs: find-agent if: needs.find-agent.outputs.found == 'true' runs-on: ubuntu-latest - timeout-minutes: 90 + timeout-minutes: 150 strategy: fail-fast: false matrix: diff --git a/CHANGELOG.md b/CHANGELOG.md index 5090d60..46824e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,15 @@ Format: newest entries first. ## [Unreleased] +## 2026-06-03 (session 2) + +### Fixed +- **Baseline agent build-volume overflow** (PR #214, `agents/baseline/agent.py`): naive L-bracket used `max_bolt_coord + 20mm` for plate dimensions, exceeding build volume on some specs; also `shelf_thickness=12mm` fixed regardless of spec load-point Z, causing "too few nodes near load point" FEA failure. Fixed: all dimensions clamped to build volume with 2mm margin; shelf height computed from `load_point_mm[2]`. +- **CONTRIBUTING model IDs** (PR #215): model IDs were shown without provider prefix (`claude-haiku-4-5` vs `anthropic/claude-haiku-4-5`); corrected to match `FORGE_MODEL_WHITELIST` format. Added `llm-agent/` to examples list. + +### Changed +- **score.yml serialization** (PR #216, `.github/workflows/score.yml`): add `concurrency: group: score-main, cancel-in-progress: false` to serialize post-merge full scoring runs. Prevents concurrent SQLite writes overwhelming forge-api at high miner volume. 
+ ## 2026-06-03 ### Added diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 07b588d..dfc6b96 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -28,10 +28,11 @@ def generate(spec: dict, llm: LLMClient) -> bytes: ... ``` -The harness injects `LLMClient` — no API key required. Whitelisted models: `claude-haiku-4-5`, `claude-3-5-haiku`, `gpt-4o-mini`. Both parameters are required; agents that omit `llm` are rejected at eval time. +The harness injects `LLMClient` — no API key required. Whitelisted models: `anthropic/claude-haiku-4-5`, `anthropic/claude-3-5-haiku`, `openai/gpt-4o-mini`. Both parameters are required; agents that omit `llm` are rejected at eval time. Starting point in `examples/`: -- `metric-aware-agent/` — adapts strategy per scoring metric, call `llm.chat()` to reason about geometry +- `metric-aware-agent/` — adapts strategy per scoring metric; uses `llm.chat()` to reason about geometry +- `llm-agent/` — minimal single-call LLM agent, good for getting started The agent runs inside a Docker container: - **Time:** 60 seconds | **Memory:** 4 GB | **Network:** enabled (LLM calls only) diff --git a/agents/baseline/agent.py b/agents/baseline/agent.py index c7e0cc9..687c7b2 100644 --- a/agents/baseline/agent.py +++ b/agents/baseline/agent.py @@ -29,15 +29,19 @@ def generate(spec: dict, llm: LLMClient) -> bytes: # noqa: ARG001 constraints = spec["constraints"] bolt_pattern = constraints["bolt_pattern_mm"] bolt_d = constraints["bolt_diameter_clearance_mm"] + bv = constraints["build_volume_mm"] # [x, y, z] limits + load_pt = constraints["load_point_mm"] by_coords = [p[0] for p in bolt_pattern] bz_coords = [p[1] for p in bolt_pattern] - plate_y = max(by_coords) + 20.0 - plate_z = max(bz_coords) + 20.0 + # Clamp to build volume — leave 2mm margin to avoid floating-point boundary failures + plate_y = min(max(by_coords) + 20.0, bv[1] - 2.0) + plate_z = min(max(bz_coords) + 20.0, bv[2] - 2.0) plate_thickness = 10.0 - shelf_length = 
constraints["load_point_mm"][0] + 15.0 - shelf_thickness = 12.0 + # Shelf must reach the load point in X and cover its Z coordinate + shelf_length = min(load_pt[0] + 15.0, bv[0] - 2.0) + shelf_thickness = min(max(15.0, load_pt[2] + 10.0), bv[2] - 2.0) shelf_z = plate_z # Mounting plate diff --git a/scripts/record_submissions.py b/scripts/record_submissions.py index 61e935c..f542f53 100644 --- a/scripts/record_submissions.py +++ b/scripts/record_submissions.py @@ -35,7 +35,10 @@ step_b64 = None step_file = Path(f".forge_step_{spec_id}.step") if step_file.exists(): - step_b64 = base64.b64encode(step_file.read_bytes()).decode() + # Only include non-empty STEP files; a 0-byte file means the container + # crashed before writing geometry. 200 bytes is well below any valid STEP header. + if step_file.stat().st_size > 200: + step_b64 = base64.b64encode(step_file.read_bytes()).decode() step_file.unlink(missing_ok=True) score_metric = result.get("score_metric", entry.get("metric", "mass_grams")) diff --git a/scripts/run_eval_pool.py b/scripts/run_eval_pool.py index 7b699d7..641f16e 100644 --- a/scripts/run_eval_pool.py +++ b/scripts/run_eval_pool.py @@ -72,16 +72,24 @@ if proc.stderr.strip(): print(f"[{spec_id}] stderr:\n{proc.stderr.strip()}", flush=True) - try: - result_data = json.loads(out) - except (json.JSONDecodeError, ValueError): - # Surface the tail of stderr so miners can debug crashes. - stderr_tail = proc.stderr.strip()[-400:] if proc.stderr.strip() else "" + stderr_tail = proc.stderr.strip()[-400:] if proc.stderr.strip() else "" + if proc.returncode != 0 and not out: + # Container crashed before producing any output (OOM, segfault, etc.) 
hint = f" | stderr: {stderr_tail}" if stderr_tail else "" result_data = { "passed": False, - "reason": f"Invalid JSON output: {out[:120]}{hint}", + "reason": f"Container exited {proc.returncode}{hint}", } + else: + try: + result_data = json.loads(out) + except (json.JSONDecodeError, ValueError): + # Surface the tail of stderr so miners can debug crashes. + hint = f" | stderr: {stderr_tail}" if stderr_tail else "" + result_data = { + "passed": False, + "reason": f"Invalid JSON output: {out[:120]}{hint}", + } # Check determinism on first spec if prev_score is not None: