Luce-Org · easel · Jun 3, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -22,6 +22,29 @@ jobs:
       - name: Lint Python surfaces touched by lucebox tooling
         run: uv run --frozen --extra dev ruff check .
 
+      - name: Install shellcheck (for bash test runner)
+        # ubuntu-latest typically ships shellcheck pre-installed, but install
+        # it explicitly when missing so the bash test runner can always rely
+        # on `command -v shellcheck` succeeding.
+        run: |
+          if ! command -v shellcheck >/dev/null 2>&1; then
+            sudo apt-get update
+            sudo apt-get install -y shellcheck
+          fi
+          shellcheck --version | head -3
+
+      - name: Typecheck lucebox CLI
+        run: uv run --frozen --extra dev python -m mypy --package lucebox
+
+      - name: Smoke-test lucebox.sh wrapper
+        # Catches `set -u` regressions, syntax errors, and stale dispatch
+        # handlers in the host-side wrapper + the in-container entrypoint.
+        # Runs shellcheck --severity=error across every shipped .sh file,
+        # exercises every subcommand dispatch under `set -u`, and drives the
+        # entrypoint's draft-resolution block through every family-glob
+        # branch — all on the bare runner without docker/nvidia/systemd.
+        run: bash scripts/test_lucebox_sh.sh
+
   build:
     name: Build (cmake + uv sync --extra megakernel)
     runs-on: ubuntu-latest

diff --git a/harness/clients/README.md b/harness/clients/README.md
@@ -70,6 +70,7 @@ The defaults below are the current RTX 3090 starting points for
 | OpenClaw | `run_openclaw.sh` | `MAX_CTX=204800 BUDGET=22 VERIFY_MODE=ddtree EXTRA_SERVER_ARGS=--lazy-draft` |
 | Open WebUI chat | `run_openwebui.sh` | `MAX_CTX=262144 BUDGET=22 VERIFY_MODE=ddtree EXTRA_SERVER_ARGS=--lazy-draft` |
 | Open WebUI tools | `run_openwebui_tools.sh` | `MAX_CTX=65536 BUDGET=22 VERIFY_MODE=ddtree EXTRA_SERVER_ARGS=--lazy-draft` |
+| luce-bench | `run_lucebench.sh` | `MAX_CTX=32768 BUDGET=22 VERIFY_MODE=ddtree EXTRA_SERVER_ARGS=--lazy-draft` |
 
 Override any setting inline:
 
@@ -102,6 +103,29 @@ OpenAI Chat Completions clients can call llama.cpp directly. Claude Code and
 Codex use `llamacpp_compat_proxy.py` so their real Anthropic Messages and
 Responses requests can be compared too.
 
+## luce-bench
+
+`run_lucebench.sh` is the odd one out: the "client" is `luce-bench` (the
+in-tree capability bench at `luce-bench/`), not a vendored binary. It hits
+`/v1/chat/completions` with the standard ds4-eval / HumanEval / longctx /
+agent / forge case sets and writes per-case PASS/FAIL + timings.
+
+Useful as a regression gate: a server change that breaks tool-call parsing,
+chat-template rendering, or sampling defaults will show up here the same way
+it would break a real-client launcher above.
+
+```bash
+# Default — runs the level1 set: smoke, code, gsm8k, agent, longctx
+harness/clients/run_lucebench.sh
+
+# Single area
+LUCEBENCH_AREA=code harness/clients/run_lucebench.sh
+LUCEBENCH_AREA=ds4-eval LUCEBENCH_THINK=1 harness/clients/run_lucebench.sh
+
+# Knobs (see top of run_lucebench.sh): LUCEBENCH_AREA, LUCEBENCH_THINK,
+# LUCEBENCH_MAX_TOKENS, LUCEBENCH_TIMEOUT, LUCEBENCH_PARALLEL.
+```
+
 ## Notes
 
 - `common.sh` contains the shared server startup logic.

diff --git a/harness/clients/run_lucebench.sh b/harness/clients/run_lucebench.sh
@@ -0,0 +1,116 @@
+#!/usr/bin/env bash
+# Run luce-bench as a harness client against a freshly-started Lucebox server.
+#
+# Slots into the same start-server → run-client → save-logs → stop-server
+# pattern as the other harness/clients/run_*.sh wrappers (run_codex.sh,
+# run_claude_code.sh, etc.). The "client" here is luce-bench (the standalone
+# HTTP capability bench, now an in-tree workspace member at luce-bench/).
+#
+# Why this exists: luce-bench is just another HTTP client of /v1/chat/completions.
+# Wrapping it in the harness pattern gives operators a uniform way to invoke
+# it ("did this server change break luce-bench?") alongside real-client smoke
+# tests, and lets the harness sweep matrix surface luce-bench regressions the
+# same way it surfaces an OpenCode or Hermes regression.
+#
+# Knobs (env var or default):
+#   LUCEBENCH_AREA      area(s) to run: a single area, a comma-separated
+#                       list, or `all`; forwarded to luce-bench's `--areas`.
+#                       (default: empty → the level1 set
+#                       `smoke,code,gsm8k,agent,longctx` — matches
+#                       `luce-bench/src/lucebench/levels.py:LEVELS["level1"]`.
+#                       Use `LUCEBENCH_AREA=all` for the full stdlib sweep;
+#                       `LUCEBENCH_AREA=forge` requires the [forge] extra.)
+#   LUCEBENCH_THINK     1 → --think, 0 → --no-think, empty → per-area
+#                       defaults from luce-bench's area cards (recommended).
+#                       Default empty so we don't override card-defined
+#                       defaults; set `LUCEBENCH_THINK=0` for the
+#                       ~4× faster nothink mode on gemma-4-26b (see
+#                       2026-05-26 think/nothink comparison) when running
+#                       A/B sweeps.
+#   LUCEBENCH_MAX_TOKENS overrides per-request decode cap when set
+#   LUCEBENCH_TIMEOUT   per-request wall timeout in seconds (default 300)
+#   LUCEBENCH_PARALLEL  in-flight concurrency (default 1 — single-GPU)
+#
+# All harness/common.sh knobs apply: MODEL_SERVER (set
+# `MODEL_SERVER=lucebox` for the native dflash_server or
+# `MODEL_SERVER=llamacpp` for llama.cpp's server), MAX_CTX, BUDGET,
+# MODEL_ID, EXTRA_SERVER_ARGS, PORT, etc.
+#
+# Output:
+#   $LOG_DIR/lucebench-<area>.json             — single-area results (per-case rows)
+#   $LOG_DIR/lucebench-sweep/                  — sweep: per-area JSONs + `_summary.{json,md}`
+#   $LOG_DIR/lucebench.out                     — stdout/stderr from the run
+#   $LOG_DIR/server.log                        — server stdout/stderr
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Launcher defaults: each applies only when the variable is unset or empty.
+MAX_CTX="${MAX_CTX:-32768}"
+BUDGET="${BUDGET:-22}"
+VERIFY_MODE="${VERIFY_MODE:-ddtree}"
+EXTRA_SERVER_ARGS="${EXTRA_SERVER_ARGS:---lazy-draft}"  # `:-` followed by `--lazy-draft`
+LUCEBENCH_AREA="${LUCEBENCH_AREA:-}"
+LUCEBENCH_THINK="${LUCEBENCH_THINK:-}"
+LUCEBENCH_MAX_TOKENS="${LUCEBENCH_MAX_TOKENS:-}"
+LUCEBENCH_TIMEOUT="${LUCEBENCH_TIMEOUT:-300}"
+LUCEBENCH_PARALLEL="${LUCEBENCH_PARALLEL:-1}"
+. "$SCRIPT_DIR/common.sh"
+
+CLIENT_OUT="$LOG_DIR/lucebench.out"
+
+# Build the luce-bench argv. With no LUCEBENCH_AREA, we run the level1 set
+# (smoke + code + gsm8k + agent + longctx — the standard capability gate
+# documented in luce-bench/src/lucebench/levels.py), and write per-area
+# JSONs + `_summary.{json,md}` under $LOG_DIR/lucebench-sweep/.
+# With LUCEBENCH_AREA=X (single area), we write a single JSON to
+# $LOG_DIR/lucebench-X.json so the file name carries the area.
+# With LUCEBENCH_AREA=<comma list> or `all`, we sweep into lucebench-sweep/.
+# `--areas` is the canonical flag since luce-bench v0.2.5; the older
+# `--sweep` is still accepted but emits a deprecation note.
+lucebench_args=(--base-url "$BASE_URL" --model "$MODEL_ID" \
+                --timeout "$LUCEBENCH_TIMEOUT" --parallel "$LUCEBENCH_PARALLEL")
+
+# Default area set when LUCEBENCH_AREA is unset/empty: the level1 capability
+# gate (mirrors luce-bench's `--level level1`). Picking `all` here was too
+# broad — it tripped slow areas (ds4-eval, forge, agent_recorded) on every
+# default run.
+: "${LUCEBENCH_AREA_DEFAULT:=smoke,code,gsm8k,agent,longctx}"
+effective_area="${LUCEBENCH_AREA:-$LUCEBENCH_AREA_DEFAULT}"
+
+if [[ "$effective_area" == *","* || "$effective_area" == "all" ]]; then
+  # Multi-area or `all`: sweep, write per-area JSONs + a roll-up.
+  lucebench_args+=(--areas "$effective_area" --out-dir "$LOG_DIR" --name lucebench-sweep)
+else
+  # Single area: one JSON named after the area for convenient diffing.
+  lucebench_args+=(--areas "$effective_area" \
+                   --json-out "$LOG_DIR/lucebench-$effective_area.json")
+fi
+
+# --think / --no-think only applies when explicitly set. Leaving the flag
+# off lets the server's card-defined defaults govern (recommended for
+# capability gates; explicit modes are for A/B sweeps).
+if [[ "$LUCEBENCH_THINK" == "1" ]]; then
+  lucebench_args+=(--think)
+elif [[ "$LUCEBENCH_THINK" == "0" ]]; then
+  lucebench_args+=(--no-think)
+fi
+
+if [[ -n "$LUCEBENCH_MAX_TOKENS" ]]; then
+  lucebench_args+=(--max-tokens "$LUCEBENCH_MAX_TOKENS")
+fi
+
+start_lucebox_server
+trap stop_lucebox_server EXIT
+wait_lucebox_server
+
+# cd with errexit on, so a bad $REPO_DIR aborts instead of benching elsewhere.
+cd "$REPO_DIR"
+# Delegate to harness.bench (the Python entry point) so this wrapper, the
+# `lucebox profile` framework, and ad-hoc operators all go through the
+# same argv-building source of truth. errexit is off only around the bench
+# itself, so a failing run still reaches finish_report with its exit code.
+set +e
+uv run python -m harness.bench "${lucebench_args[@]}" > "$CLIENT_OUT" 2>&1
+RC=$?
+set -e
+finish_report "$CLIENT_OUT" "$RC"
+exit "$RC"
diff --git a/harness/pyproject.toml b/harness/pyproject.toml
@@ -0,0 +1,33 @@
+[project]
+name = "harness"
+version = "0.1.0"
+description = "Client launchers, server-profile sweeps, and bench orchestration for Lucebox."
+readme = "README.md"
+license = { text = "Apache-2.0" }
+requires-python = ">=3.10"
+authors = [{ name = "Lucebox" }]
+
+# luce-bench is consumed lazily — `harness.bench.run` invokes
+# `python -m lucebench.cli` as a subprocess, so the package doesn't need
+# to be importable at install time. Kept out of the hard dep list so the
+# workspace can lock without luce-bench in the registry (lands in a
+# sibling PR — see #337). Install with `uv pip install luce-bench` on the
+# host running benches.
+dependencies = []
+
+[project.optional-dependencies]
+# Kept as an empty alias for backward compatibility — luce-bench[forge]
+# is itself a back-compat no-op as of luce-bench v0.2.6.
+forge = []
+dev = ["pytest>=8.0"]
+
+[project.scripts]
+harness-run-bench = "harness.bench:main"
+harness-claude-code = "harness.clients.claude_code:main"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/harness"]
diff --git a/harness/src/harness/__init__.py b/harness/src/harness/__init__.py
@@ -0,0 +1,21 @@
+"""Lucebox harness — client launchers, bench orchestration, profile sweeps.
+
+The harness is the "run X against a Lucebox server" abstraction. It owns the
+server-lifecycle + client-config patterns that the shell launchers under
+``harness/clients/`` use, exposed here as importable Python so callers like
+``lucebox profile`` can build on it without re-implementing argv.
+
+Modules:
+  - `harness.bench` — run a luce-bench area (or full sweep) against a server,
+    return the parsed JSON. The Python entry point for
+    ``harness/clients/run_lucebench.sh``.
+  - `harness.clients.claude_code` — launch Claude Code against a Lucebox
+    server with the right env (ANTHROPIC_BASE_URL, telemetry-off knobs,
+    etc.). The Python entry point for ``harness/clients/run_claude_code.sh``
+    and for the host-side ``lucebox claude`` subcommand.
+
+All entry points keep the stdlib-only invariant — fresh test boxes can run
+the harness before any project Python deps are installed.
+"""
+
+__version__ = "0.1.0"