diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 283cd0e..88f42ee 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -4,7 +4,7 @@ ## Spec - + ## Score @@ -12,11 +12,12 @@ ## Approach - + ## Checklist -- [ ] `agents//agent.py` implements `generate(spec) -> bytes` -- [ ] Local eval passes: `docker run ... --agent agents//agent.py --spec specs/001_bracket.json` -- [ ] No external network calls in `generate()` -- [ ] Agent is deterministic (same output for same spec) +- [ ] `agents//agent.py` implements `generate(spec) -> bytes` or `generate(spec, llm) -> bytes` +- [ ] `agents//spec.txt` contains the target spec ID (e.g., `pub_001_medium`) +- [ ] Local eval passes: `forge eval agents//agent.py` +- [ ] Agent is deterministic (same spec → same bytes; fix any random seeds) +- [ ] LLM agents: using an injected `LLMClient`, not a hardcoded API key diff --git a/.github/workflows/eval.yml b/.github/workflows/eval.yml index 9bcb3da..8d9c9a8 100644 --- a/.github/workflows/eval.yml +++ b/.github/workflows/eval.yml @@ -21,7 +21,7 @@ jobs: id: agent run: | AGENT=$(git diff --name-only origin/${{ github.base_ref }}...HEAD \ - | grep '^agents/.*/agent\.py$' | head -1) + | grep '^agents/.*/agent\.py$' | grep -v '^agents/template/' | head -1) if [ -z "$AGENT" ]; then echo "No agent.py changed — skipping eval." 
echo "found=false" >> "$GITHUB_OUTPUT" @@ -102,10 +102,12 @@ jobs: STEP_FLAG="--step-out /forge/.forge_step_output.step" fi OUT=$(docker run --rm \ - --network none \ --security-opt no-new-privileges \ --memory 4g \ --cpus 2 \ + -e FORGE_LLM_KEY=${{ secrets.FORGE_LLM_KEY || secrets.OPENROUTER_KEY }} \ + -e FORGE_MODEL=${{ secrets.FORGE_MODEL || 'anthropic/claude-haiku-4-5' }} \ + -e FORGE_MODEL_WHITELIST=${{ vars.FORGE_MODEL_WHITELIST || 'anthropic/claude-haiku-4-5,anthropic/claude-3-5-haiku,openai/gpt-4o-mini' }} \ -v "${{ github.workspace }}:/forge" \ forge-eval \ --agent /forge/${{ steps.agent.outputs.path }} \ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5bf457e..835e0d6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,19 +18,79 @@ mkdir agents/ touch agents//agent.py ``` -Implement the `generate` function: +Implement the `generate` function. There are two supported signatures: +**Static agent** (no LLM — backward compatible): ```python def generate(spec: dict) -> bytes: """Return STEP file bytes for a part that satisfies spec.""" ... ``` +**LLM agent** (recommended): +```python +from forge.sdk.llm import LLMClient + +def generate(spec: dict, llm: LLMClient) -> bytes: + """Return STEP file bytes, using the LLM to reason about geometry.""" + ... +``` + +The harness detects which signature you use via `inspect.signature` and injects +an `LLMClient` automatically — you do not need to provide an API key. + +#### Using the LLM client + +`LLMClient` wraps the OpenRouter API: + +```python +response: str = llm.chat( + messages=[{"role": "user", "content": "Your prompt here"}], + max_tokens=512, +) +``` + +The model is chosen by the harness via `FORGE_MODEL`. During CI, only +whitelisted models are accepted: + +- `anthropic/claude-haiku-4-5` +- `anthropic/claude-3-5-haiku` +- `openai/gpt-4o-mini` + +Miners do not configure the API key or model — the harness injects both. 
+ +#### Observe → Plan → Act pattern + +```python +from forge.sdk.llm import LLMClient +import json + +def generate(spec: dict, llm: LLMClient) -> bytes: + # Observe: extract constraints + c = spec["constraints"] + + # Plan: ask the LLM to reason about geometry parameters + raw = llm.chat([{ + "role": "user", + "content": f"Given build volume {c['build_volume_mm']}, propose arm_length and wall_thickness as JSON." + }]) + dims = json.loads(raw) + + # Act: build the geometry with build123d + from build123d import Box, BuildPart + with BuildPart() as part: + Box(dims["arm_length"], dims["wall_thickness"], dims["wall_thickness"]) + + # ... export to STEP and return bytes +``` + +See `examples/llm-agent/agent.py` for a complete working example. + The agent runs inside a Docker container with these constraints: - **Time:** 60 seconds - **Memory:** 4 GB -- **Network:** disabled -- **Libraries available:** `build123d`, `gmsh`, `numpy`, `scipy`, `OCP` +- **Network:** enabled (required for LLM API calls) +- **Libraries available:** `build123d`, `gmsh`, `numpy`, `scipy`, `OCP`, `httpx` ### 3. Test locally diff --git a/QUICKSTART.md b/QUICKSTART.md index 652ea39..63f0d53 100644 --- a/QUICKSTART.md +++ b/QUICKSTART.md @@ -75,21 +75,31 @@ curl http://143.244.191.193:8000/specs/001_bracket cp -r agents/template agents/ ``` -Edit `agents//agent.py`. The only contract: +Edit `agents//agent.py`. Two supported signatures: +**Static agent** (no LLM): ```python def generate(spec: dict) -> bytes: - """ - Takes the spec dict (load, bolt pattern, build volume, material). - Returns STEP file bytes for your design. - """ + """Takes the spec dict, returns STEP file bytes.""" ... ``` -See `agents/taper-beam/agent.py` for a clean I-beam reference implementation (~38g). -See `agents/lean-arm/agent.py` for the I-beam baseline (~32g). -See `agents/pocket-plate/agent.py` for the wall-pocketing approach (~30g). -See `agents/compact-arm/agent.py` for the current SOTA (~27g). 
+**LLM agent** (recommended — harness injects the client): +```python +from forge.sdk.llm import LLMClient + +def generate(spec: dict, llm: LLMClient) -> bytes: + """Use the LLM to reason about geometry, then return STEP bytes.""" + response = llm.chat([{"role": "user", "content": "..."}]) + ... +``` + +No API key needed — the harness injects `LLMClient` automatically using whitelisted models. See `examples/llm-agent/agent.py` for a complete working example. + +Reference implementations in `agents/`: +- `taper-beam/` — clean I-beam (~38g) +- `lean-arm/` — I-beam baseline (~32g) +- `compact-arm/` — pocketed arm approach --- diff --git a/README.md b/README.md index f38f8c1..a576458 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ forge eval agents//agent.py ## Submitting 1. Fork this repo. -2. Create `agents//agent.py` with a `generate(spec: dict) -> bytes` function. +2. Create `agents//agent.py` with a `generate(spec, [llm]) -> bytes` function. 3. Open a PR. CI scores your design automatically (~2 min) and posts: ``` ## Forge Eval — NEW LEADER 🏆 @@ -77,18 +77,30 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for full guidelines. ## Agent interface +Two supported signatures — the harness detects which one you use automatically: + +**Static agent** (no LLM): ```python def generate(spec: dict) -> bytes: - """ - Takes the spec dict (load, bolt pattern, build volume, material). - Returns STEP file bytes for your design. + """Build and return STEP file bytes for the given spec.""" + ... +``` - Sandbox: 60s timeout, 4GB RAM, no network access. - """ +**LLM agent** (recommended): +```python +from forge.sdk.llm import LLMClient + +def generate(spec: dict, llm: LLMClient) -> bytes: + """Use the LLM to reason about geometry, then return STEP bytes.""" + response = llm.chat([{"role": "user", "content": "..."}]) ... ``` -Libraries available in eval: `build123d`, `OCP`, `gmsh`, `numpy`, `scipy`. See agents/ for reference implementations. 
+The harness injects `LLMClient` automatically — no API key required. Whitelisted models: `anthropic/claude-haiku-4-5`, `anthropic/claude-3-5-haiku`, `openai/gpt-4o-mini`. See `examples/llm-agent/` for a complete working example. + +Sandbox constraints: **60s timeout · 4 GB RAM · network enabled (required for LLM API calls)** + +Libraries available: `build123d`, `OCP`, `gmsh`, `numpy`, `scipy`, `httpx`. See `agents/` for reference implementations. --- @@ -135,11 +147,12 @@ All CPU. No GPU required. Live: http://143.244.191.193:8000/sota -| Spec | Score | Agent | FEA Stress | -|---|---|---|---| -| 001 Wall Bracket | **27.22 g** | compact-arm | 13.8 / 25.0 MPa | -| 002 Equipment Mount | — | — | — | -| 003 Pipe-Clamp | 2799.52 g | baseline_steel | 22.18 / 82.0 MPa | +| Spec | Score | Agent | +|---|---|---| +| spec-001 Wall Bracket | **23.48 g** | sub-nano | +| spec-002 Equipment Mount | **25.84 g** | al-bracket-v19 | +| spec-003 Pipe-Clamp | **71.42 g** | ss-bracket-v15 | +| pub_001 – pub_005 | see leaderboard | various | --- diff --git a/agents/.keep b/agents/.keep new file mode 100644 index 0000000..e69de29 diff --git a/agents/template/agent.py b/agents/template/agent.py index 0bb7c38..d7abfdb 100644 --- a/agents/template/agent.py +++ b/agents/template/agent.py @@ -1,17 +1,23 @@ """ Template agent — start here. -Contract: implement generate(spec) -> bytes (STEP file). +Two supported signatures: -The eval harness calls generate() with the spec dict for each problem. -Return valid STEP bytes. The harness handles geometry checks and FEA; -your job is to return the lightest part that passes all constraints. + generate(spec: dict) -> bytes # static agent + generate(spec: dict, llm: LLMClient) -> bytes # LLM agent (recommended) -See QUICKSTART.md for a full walkthrough. +The harness detects which you use via inspect.signature and injects LLMClient +automatically if present — no API key required from you. + +See QUICKSTART.md for a full walkthrough and examples/llm-agent/ for an +LLM agent example.
""" from __future__ import annotations +# To use the LLM client, uncomment: +# from forge.sdk.llm import LLMClient + # TODO: import your geometry library # from build123d import ... # recommended # from OCP.BRepPrimAPI import ... # raw OCP (see agents/baseline/) @@ -21,27 +27,27 @@ def generate(spec: dict) -> bytes: """ Build and return a STEP file for the given spec. + To use an LLM, change the signature to: generate(spec, llm: LLMClient) + Args: - spec: Problem specification dict. Key structure: - spec["constraints"]["load_n"] — applied load in Newtons - spec["constraints"]["load_point_mm"] — [x, y, z] load application point - spec["constraints"]["build_volume_mm"] — [x, y, z] max bounding box - spec["constraints"]["bolt_pattern_mm"] — [[y, z], ...] bolt hole centers (x=0 plane) - spec["constraints"]["bolt_diameter_clearance_mm"] — minimum clearance diameter - spec["constraints"]["min_wall_thickness_mm"] — minimum feature wall - spec["constraints"]["max_overhang_deg"] — max overhang from vertical - spec["material"] — material name (see benchmark/materials.py) + spec: Problem specification dict. Key fields: + spec["constraints"]["load_n"] — load in Newtons + spec["constraints"]["load_point_mm"] — [x, y, z] load point + spec["constraints"]["build_volume_mm"] — [x, y, z] bounding box + spec["constraints"]["bolt_pattern_mm"] — [[y, z], ...] bolt centers + spec["constraints"]["bolt_diameter_clearance_mm"] — hole clearance + spec["constraints"]["min_wall_thickness_mm"] — minimum wall + spec["constraints"]["max_overhang_deg"] — max printable overhang + spec["material"] — material name spec["safety_factor"] — FEA stress safety factor + spec["scoring"]["metric"] — "mass_grams" | "volume_mm3" | ... Returns: - STEP file as raw bytes. Must be valid AP214IS STEP. + STEP file as raw bytes (AP214IS schema required). Notes: - - Must be deterministic: same spec → same bytes every call. - If you use any randomness, fix the seed (e.g. random.seed(42)). 
- - The FEA mesh uses C3D4 linear tets at ~2 mm characteristic length. - Avoid features thinner than 3 mm — they produce degenerate elements. - - Lower mass = better score. There is no ceiling; keep optimizing. + - Must be deterministic: same spec → same bytes. Fix any random seeds. + - Avoid features thinner than 3 mm — they produce degenerate FEA elements. """ constraints = spec["constraints"] diff --git a/benchmark/_worker.py b/benchmark/_worker.py index 22d8f95..d121200 100644 --- a/benchmark/_worker.py +++ b/benchmark/_worker.py @@ -15,12 +15,16 @@ import argparse import importlib.util +import inspect import json import os import resource import sys from pathlib import Path +# Make forge.sdk importable regardless of install state. +sys.path.insert(0, str(Path(__file__).parent.parent)) + CPU_SECONDS = 150 @@ -53,7 +57,14 @@ def main() -> None: try: loader_spec.loader.exec_module(mod) - step_bytes = mod.generate(spec) + + sig = inspect.signature(mod.generate) + if len(sig.parameters) >= 2: + from forge.sdk.llm import LLMClient + llm = LLMClient() + step_bytes = mod.generate(spec, llm) + else: + step_bytes = mod.generate(spec) except Exception as exc: print(f"{type(exc).__name__}: {exc}", file=sys.stderr) sys.exit(1) diff --git a/cli.py b/cli.py index 6e99c06..5baeb1b 100644 --- a/cli.py +++ b/cli.py @@ -472,8 +472,15 @@ def _run_evaluate(agent_path: str, spec_path: str, verbose: bool) -> dict: "--spec", spec_path, "--json", ] + # Inherit environment; supply defaults so LLM agents work without extra setup. 
+    env = os.environ.copy()
+    env.setdefault("FORGE_MODEL", "anthropic/claude-haiku-4-5")
+    env.setdefault(
+        "FORGE_MODEL_WHITELIST",
+        "anthropic/claude-haiku-4-5,anthropic/claude-3-5-haiku,openai/gpt-4o-mini",
+    )
     try:
-        proc = subprocess.run(cmd, capture_output=True, text=True, cwd=str(ROOT))
+        proc = subprocess.run(cmd, capture_output=True, text=True, cwd=str(ROOT), env=env)
     except FileNotFoundError:
         return {"passed": False, "stage": "error",
                 "reason": "benchmark module not found — run from repo root"}
diff --git a/examples/llm-agent/agent.py b/examples/llm-agent/agent.py
new file mode 100644
index 0000000..d7d04ed
--- /dev/null
+++ b/examples/llm-agent/agent.py
@@ -0,0 +1,124 @@
+"""
+Example LLM agent — observe → plan → act.
+
+The harness injects an LLMClient bound to a whitelisted model. This agent
+asks the LLM to propose dimensions for a simple L-bracket, then builds it
+with build123d and returns STEP bytes.
+"""
+
+from __future__ import annotations
+
+import json
+import tempfile
+from pathlib import Path
+
+from build123d import (
+    Box,
+    BuildPart,
+    Cylinder,
+    Locations,
+    Mode,
+    export_step,
+)
+
+from forge.sdk.llm import LLMClient
+
+_DIM_KEYS = (
+    "arm_length",
+    "arm_thickness",
+    "plate_height",
+    "plate_width",
+    "plate_thickness",
+)
+
+
+def _parse_dims(raw: str) -> dict[str, int]:
+    """Extract the five bracket dimensions from an LLM reply.
+
+    Models often wrap JSON in Markdown fences or add stray prose even when
+    told not to, so only the outermost ``{...}`` span is parsed.
+
+    Raises:
+        ValueError: if the reply holds no JSON object or it is malformed.
+        KeyError: if a required dimension is missing.
+    """
+    start, end = raw.find("{"), raw.rfind("}")
+    if start == -1 or end < start:
+        raise ValueError(f"LLM reply contains no JSON object: {raw!r}")
+    data = json.loads(raw[start : end + 1])
+    return {key: int(data[key]) for key in _DIM_KEYS}
+
+
+def generate(spec: dict, llm: LLMClient) -> bytes:
+    """Ask the LLM for L-bracket dimensions, build the part, return STEP bytes.
+
+    Raises:
+        ValueError: if the reply is unusable or yields a non-positive size.
+    """
+    # ── Observe ──────────────────────────────────────────────────────────────
+    c = spec["constraints"]
+    bv = c["build_volume_mm"]  # [x, y, z] bounding box
+    load_pt = c["load_point_mm"]  # [x, y, z]
+    bolt_d = c["bolt_diameter_clearance_mm"]
+    min_wall = c["min_wall_thickness_mm"]
+    # Thickness floor quoted to the LLM and enforced on its answer below.
+    min_t = max(4, int(min_wall) + 2)
+
+    # ── Plan (LLM proposes dimensions) ────────────────────────────────────────
+    prompt = f"""You are a mechanical CAD assistant. Given this bracket spec, propose
+integer dimensions (mm) for a minimal L-bracket with a vertical mount plate and a
+horizontal arm. Reply with ONLY valid JSON, no prose.
+
+Spec:
+  build_volume_mm: {bv}
+  load_point_mm: {load_pt}
+  bolt_clearance_mm: {bolt_d}
+  min_wall_mm: {min_wall}
+
+Return JSON with exactly these keys:
+  arm_length      — horizontal arm length (x-axis), int
+  arm_thickness   — arm wall thickness, int >= {min_t}
+  plate_height    — mount plate height (z-axis), int
+  plate_width     — mount plate width (y-axis), int
+  plate_thickness — mount plate thickness (x-axis), int >= {min_t}
+"""
+
+    raw = llm.chat(
+        [{"role": "user", "content": prompt}],
+        max_tokens=256,
+    )
+    dims = _parse_dims(raw)
+
+    # The model's numbers are untrusted: enforce the thickness floor and
+    # clamp everything to the build volume.
+    plate_t = max(dims["plate_thickness"], min_t)
+    arm_t = max(dims["arm_thickness"], min_t)
+    plate_h = min(dims["plate_height"], int(bv[2]))
+    plate_w = min(dims["plate_width"], int(bv[1]))
+    arm_len = min(dims["arm_length"], int(bv[0]) - plate_t)
+    if min(arm_len, plate_h, plate_w) <= 0:
+        raise ValueError(f"LLM proposed unusable dimensions: {dims}")
+
+    # ── Act (build geometry) ──────────────────────────────────────────────────
+    with BuildPart() as part:
+        # Vertical mount plate occupying x in [0, plate_t]
+        with Locations((plate_t / 2, plate_w / 2, plate_h / 2)):
+            Box(plate_t, plate_w, plate_h)
+
+        # Horizontal arm extending along +x
+        with Locations((plate_t + arm_len / 2, plate_w / 2, arm_t / 2)):
+            Box(arm_len, plate_w, arm_t)
+
+        # Bolt holes along the x-axis, centred in the plate thickness so the
+        # cut passes through the whole plate.
+        bolt_r = bolt_d / 2
+        for by, bz in c["bolt_pattern_mm"]:
+            with Locations((plate_t / 2, by, bz)):
+                Cylinder(bolt_r, plate_t, rotation=(0, 90, 0), mode=Mode.SUBTRACT)
+
+    # Write via a temporary directory so the file is removed even if the
+    # export raises, and read it back without leaking a file handle.
+    with tempfile.TemporaryDirectory() as tmp:
+        step_path = Path(tmp) / "part.step"
+        export_step(part.part, str(step_path))
+        return step_path.read_bytes()
diff --git a/examples/llm-agent/spec.txt b/examples/llm-agent/spec.txt
new file mode 100644
index 0000000..30cda4b
--- /dev/null
+++ b/examples/llm-agent/spec.txt
@@ -0,0 +1 @@
+pub_001_medium
\ No newline at end of file
diff --git
a/forge/__init__.py b/forge/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/forge/sdk/__init__.py b/forge/sdk/__init__.py
new file mode 100644
index 0000000..e60c25c
--- /dev/null
+++ b/forge/sdk/__init__.py
@@ -0,0 +1,3 @@
+from forge.sdk.llm import LLMClient
+
+__all__ = ["LLMClient"]
diff --git a/forge/sdk/llm.py b/forge/sdk/llm.py
new file mode 100644
index 0000000..55c36a5
--- /dev/null
+++ b/forge/sdk/llm.py
@@ -0,0 +1,92 @@
+"""
+LLM client for Forge agents.
+
+Reads configuration from environment:
+  FORGE_LLM_KEY          — OpenRouter API key (required at chat time)
+  FORGE_MODEL            — model ID to use (e.g. "anthropic/claude-haiku-4-5")
+  FORGE_MODEL_WHITELIST  — comma-separated allowed model IDs; omit to allow any
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Any
+
+import httpx
+
+OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
+
+
+class LLMClient:
+    """Minimal chat client for the OpenRouter chat-completions endpoint."""
+
+    def __init__(self) -> None:
+        """Read key, model and whitelist from the environment.
+
+        Raises:
+            ValueError: if a whitelist is set and FORGE_MODEL is not in it.
+        """
+        self._key = os.environ.get("FORGE_LLM_KEY", "").strip()
+        self._model = os.environ.get("FORGE_MODEL", "").strip()
+
+        raw_whitelist = os.environ.get("FORGE_MODEL_WHITELIST", "").strip()
+        self._whitelist: list[str] | None = (
+            [m.strip() for m in raw_whitelist.split(",") if m.strip()]
+            if raw_whitelist
+            else None
+        )
+
+        if self._whitelist is not None and self._model not in self._whitelist:
+            raise ValueError(
+                f"Model {self._model!r} is not in the whitelist: {self._whitelist}"
+            )
+
+    @property
+    def model(self) -> str:
+        """Model ID requests are sent to (empty if FORGE_MODEL is unset)."""
+        return self._model
+
+    def chat(
+        self,
+        messages: list[dict[str, Any]],
+        max_tokens: int = 4096,
+        *,
+        temperature: float = 0.0,
+    ) -> str:
+        """Send ``messages`` to the configured model and return the reply text.
+
+        ``temperature`` defaults to 0 because agents are required to be
+        deterministic; it reduces, but cannot eliminate, run-to-run variation.
+
+        Raises:
+            RuntimeError: if the key or model is unset, or the reply has no text.
+            httpx.HTTPError: on transport failures and non-2xx responses.
+        """
+        if not self._key:
+            raise RuntimeError("No LLM key configured — set FORGE_LLM_KEY")
+        if not self._model:
+            raise RuntimeError("No model configured — set FORGE_MODEL")
+
+        response = httpx.post(
+            OPENROUTER_URL,
+            headers={
+                "Authorization": f"Bearer {self._key}",
+                "Content-Type": "application/json",
+            },
+            json={
+                "model": self._model,
+                "messages": messages,
+                "max_tokens": max_tokens,
+                "temperature": temperature,
+            },
+            timeout=60.0,
+        )
+        response.raise_for_status()
+
+        try:
+            content = response.json()["choices"][0]["message"]["content"]
+        except (KeyError, IndexError, TypeError, ValueError) as exc:
+            raise RuntimeError("Unexpected response shape from LLM API") from exc
+        if not isinstance(content, str):
+            raise RuntimeError("LLM API returned no text content")
+        return content
diff --git a/requirements.txt b/requirements.txt
index 08ef611..7e2a32b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 build123d>=0.8.0
 gmsh>=4.12.0
+httpx>=0.27.0
 numpy>=1.26.0