From daf30f47b2f6167bb6dba3ebcf3c19326833711e Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Wed, 3 Jun 2026 17:29:35 -0400 Subject: [PATCH 1/4] feat(lucebox): hub CLI with autotune, sweep, profile, smoke + harness adapters New lucebox/ Python package exposing the hub CLI (autotune, sweep, profile, smoke, models, config, download, host-check, docker_run) plus the lucebox.sh launcher wrapper and install.sh. Adds the harness/ adapter package wrapping external coding agents (claude_code, codex, hermes, openclaw, opencode, pi) that autotune sweeps drive. Ships scripts/check_lucebox_wrapper_sandbox.sh and scripts/test_lucebox_sh.sh for wrapper validation, full pytest coverage under lucebox/tests/, and the bragi autotune profile-sweep protocol docs. This is the user-facing surface of lucebox-hub: one CLI to launch the image, tune layer-split / pflash settings against a host, run sweeps, and dispatch bench runs. Splitting it out keeps Python-side review independent of the C++ server and Docker stack reviews. 
- #334 (docker-stack): docker_run.py launches the lucebox-hub image - #337 (lucebench-harness): lucebox bench delegates to luce-bench (workspace dep) - #336 (server-layer-split): autotune presumes layer-split build artifacts --- .github/workflows/ci.yml | 23 + harness/clients/README.md | 24 + harness/clients/run_lucebench.sh | 116 ++ harness/pyproject.toml | 33 + harness/src/harness/__init__.py | 21 + harness/src/harness/bench.py | 231 +++ harness/src/harness/clients/__init__.py | 17 + harness/src/harness/clients/_common.py | 86 ++ harness/src/harness/clients/claude_code.py | 194 +++ harness/src/harness/clients/codex.py | 129 ++ harness/src/harness/clients/hermes.py | 187 +++ harness/src/harness/clients/openclaw.py | 170 +++ harness/src/harness/clients/opencode.py | 196 +++ harness/src/harness/clients/pi.py | 130 ++ harness/src/harness/py.typed | 0 install.sh | 138 ++ lefthook.yml | 59 + lucebox.sh | 1275 +++++++++++++++++ lucebox/.gitignore | 3 + lucebox/README.md | 17 + lucebox/pyproject.toml | 54 + lucebox/src/lucebox/__init__.py | 15 + lucebox/src/lucebox/__main__.py | 6 + lucebox/src/lucebox/autotune.py | 489 +++++++ lucebox/src/lucebox/cli.py | 758 ++++++++++ lucebox/src/lucebox/config.py | 463 ++++++ lucebox/src/lucebox/docker_run.py | 232 +++ lucebox/src/lucebox/download.py | 500 +++++++ lucebox/src/lucebox/host_check.py | 232 +++ lucebox/src/lucebox/host_facts.py | 58 + lucebox/src/lucebox/profile.py | 203 +++ lucebox/src/lucebox/py.typed | 1 + lucebox/src/lucebox/smoke.py | 247 ++++ lucebox/src/lucebox/sweep.py | 868 +++++++++++ lucebox/src/lucebox/types.py | 140 ++ lucebox/tests/test_autotune.py | 135 ++ .../tests/test_autotune_candidate_configs.py | 92 ++ lucebox/tests/test_autotune_cli.py | 175 +++ lucebox/tests/test_check.py | 118 ++ lucebox/tests/test_cli.py | 111 ++ lucebox/tests/test_config.py | 176 +++ lucebox/tests/test_config_cli.py | 127 ++ lucebox/tests/test_download.py | 301 ++++ lucebox/tests/test_models_cli.py | 142 ++ 
lucebox/tests/test_profile.py | 159 ++ lucebox/tests/test_smoke.py | 36 + lucebox/tests/test_sweep.py | 523 +++++++ pyproject.toml | 10 +- scripts/check_lucebox_wrapper_sandbox.sh | 242 ++++ scripts/test_lucebox_sh.sh | 1131 +++++++++++++++ uv.lock | 49 + 51 files changed, 10839 insertions(+), 3 deletions(-) create mode 100755 harness/clients/run_lucebench.sh create mode 100644 harness/pyproject.toml create mode 100644 harness/src/harness/__init__.py create mode 100644 harness/src/harness/bench.py create mode 100644 harness/src/harness/clients/__init__.py create mode 100644 harness/src/harness/clients/_common.py create mode 100644 harness/src/harness/clients/claude_code.py create mode 100644 harness/src/harness/clients/codex.py create mode 100644 harness/src/harness/clients/hermes.py create mode 100644 harness/src/harness/clients/openclaw.py create mode 100644 harness/src/harness/clients/opencode.py create mode 100644 harness/src/harness/clients/pi.py create mode 100644 harness/src/harness/py.typed create mode 100755 install.sh create mode 100644 lefthook.yml create mode 100755 lucebox.sh create mode 100644 lucebox/.gitignore create mode 100644 lucebox/README.md create mode 100644 lucebox/pyproject.toml create mode 100644 lucebox/src/lucebox/__init__.py create mode 100644 lucebox/src/lucebox/__main__.py create mode 100644 lucebox/src/lucebox/autotune.py create mode 100644 lucebox/src/lucebox/cli.py create mode 100644 lucebox/src/lucebox/config.py create mode 100644 lucebox/src/lucebox/docker_run.py create mode 100644 lucebox/src/lucebox/download.py create mode 100644 lucebox/src/lucebox/host_check.py create mode 100644 lucebox/src/lucebox/host_facts.py create mode 100644 lucebox/src/lucebox/profile.py create mode 100644 lucebox/src/lucebox/py.typed create mode 100644 lucebox/src/lucebox/smoke.py create mode 100644 lucebox/src/lucebox/sweep.py create mode 100644 lucebox/src/lucebox/types.py create mode 100644 lucebox/tests/test_autotune.py create mode 100644 
lucebox/tests/test_autotune_candidate_configs.py create mode 100644 lucebox/tests/test_autotune_cli.py create mode 100644 lucebox/tests/test_check.py create mode 100644 lucebox/tests/test_cli.py create mode 100644 lucebox/tests/test_config.py create mode 100644 lucebox/tests/test_config_cli.py create mode 100644 lucebox/tests/test_download.py create mode 100644 lucebox/tests/test_models_cli.py create mode 100644 lucebox/tests/test_profile.py create mode 100644 lucebox/tests/test_smoke.py create mode 100644 lucebox/tests/test_sweep.py create mode 100755 scripts/check_lucebox_wrapper_sandbox.sh create mode 100755 scripts/test_lucebox_sh.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 46919deb..c287e8e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,6 +22,29 @@ jobs: - name: Lint Python surfaces touched by lucebox tooling run: uv run --frozen --extra dev ruff check . + - name: Install shellcheck (for bash test runner) + # ubuntu-latest typically ships shellcheck pre-installed, but pin + # the dependency explicitly so the bash test runner can always rely + # on `command -v shellcheck` succeeding. + run: | + if ! command -v shellcheck >/dev/null 2>&1; then + sudo apt-get update + sudo apt-get install -y shellcheck + fi + shellcheck --version | head -3 + + - name: Typecheck lucebox CLI + run: uv run --frozen --extra dev python -m mypy --package lucebox + + - name: Smoke-test lucebox.sh wrapper + # Catches `set -u` regressions, syntax errors, and stale dispatch + # handlers in the host-side wrapper + the in-container entrypoint. + # Runs shellcheck --severity=error across every shipped .sh file, + # exercises every subcommand dispatch under `set -u`, and drives the + # entrypoint's draft-resolution block through every family-glob + # branch — all on the bare runner without docker/nvidia/systemd. 
+ run: bash scripts/test_lucebox_sh.sh + build: name: Build (cmake + uv sync --extra megakernel) runs-on: ubuntu-latest diff --git a/harness/clients/README.md b/harness/clients/README.md index dee7758e..edc87a39 100644 --- a/harness/clients/README.md +++ b/harness/clients/README.md @@ -70,6 +70,7 @@ The defaults below are the current RTX 3090 starting points for | OpenClaw | `run_openclaw.sh` | `MAX_CTX=204800 BUDGET=22 VERIFY_MODE=ddtree EXTRA_SERVER_ARGS=--lazy-draft` | | Open WebUI chat | `run_openwebui.sh` | `MAX_CTX=262144 BUDGET=22 VERIFY_MODE=ddtree EXTRA_SERVER_ARGS=--lazy-draft` | | Open WebUI tools | `run_openwebui_tools.sh` | `MAX_CTX=65536 BUDGET=22 VERIFY_MODE=ddtree EXTRA_SERVER_ARGS=--lazy-draft` | +| luce-bench | `run_lucebench.sh` | `MAX_CTX=32768 BUDGET=22 VERIFY_MODE=ddtree EXTRA_SERVER_ARGS=--lazy-draft` | Override any setting inline: @@ -102,6 +103,29 @@ OpenAI Chat Completions clients can call llama.cpp directly. Claude Code and Codex use `llamacpp_compat_proxy.py` so their real Anthropic Messages and Responses requests can be compared too. +## luce-bench + +`run_lucebench.sh` is the odd one out: the "client" is `luce-bench` (the +in-tree capability bench at `luce-bench/`), not a vendored binary. It hits +`/v1/chat/completions` with the standard ds4-eval / HumanEval / longctx / +agent / forge case sets and writes per-case PASS/FAIL + timings. + +Useful as a regression gate: a server change that breaks tool-call parsing, +chat-template rendering, or sampling defaults will show up here the same way +it would break a real-client launcher above. + +```bash +# Default — runs the level1 set: smoke, code, gsm8k, agent, longctx +harness/clients/run_lucebench.sh + +# Single area +LUCEBENCH_AREA=code harness/clients/run_lucebench.sh +LUCEBENCH_AREA=ds4-eval LUCEBENCH_THINK=1 harness/clients/run_lucebench.sh + +# Knobs (see top of run_lucebench.sh): LUCEBENCH_AREA, LUCEBENCH_THINK, +# LUCEBENCH_MAX_TOKENS, LUCEBENCH_TIMEOUT, LUCEBENCH_PARALLEL. 
+``` + ## Notes - `common.sh` contains the shared server startup logic. diff --git a/harness/clients/run_lucebench.sh b/harness/clients/run_lucebench.sh new file mode 100755 index 00000000..7fefd0eb --- /dev/null +++ b/harness/clients/run_lucebench.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# Run luce-bench as a harness client against a freshly-started Lucebox server. +# +# Slots into the same start-server → run-client → save-logs → stop-server +# pattern as the other harness/clients/run_*.sh wrappers (run_codex.sh, +# run_claude_code.sh, etc.). The "client" here is luce-bench (the standalone +# HTTP capability bench, now an in-tree workspace member at luce-bench/). +# +# Why this exists: luce-bench is just another HTTP client of /v1/chat/completions. +# Wrapping it in the harness pattern gives operators a uniform way to invoke +# it ("did this server change break luce-bench?") alongside real-client smoke +# tests, and lets the harness sweep matrix surface luce-bench regressions the +# same way it surfaces an OpenCode or Hermes regression. +# +# Knobs (env var or default): +# LUCEBENCH_AREA area(s) to run; pass the comma list (or `all`) to +# luce-bench directly. +# (default: empty → the level1 set +# `smoke,code,gsm8k,agent,longctx` — matches +# `luce-bench/src/lucebench/levels.py:LEVELS["level1"]`. +# Use `LUCEBENCH_AREA=all` for the full stdlib sweep; +# `LUCEBENCH_AREA=forge` requires the [forge] extra.) +# LUCEBENCH_THINK 1 → --think, 0 → --no-think, empty → per-area +# defaults from luce-bench's area cards (recommended). +# Default empty so we don't override card-defined +# defaults; set `LUCEBENCH_THINK=0` for the +# ~4× faster nothink mode on gemma-4-26b (see +# 2026-05-26 think/nothink comparison) when running +# A/B sweeps. 
+# LUCEBENCH_MAX_TOKENS overrides per-request decode cap when set +# LUCEBENCH_TIMEOUT per-request wall timeout in seconds (default 300) +# LUCEBENCH_PARALLEL in-flight concurrency (default 1 — single-GPU) +# +# All harness/common.sh knobs apply: MODEL_SERVER (set +# `MODEL_SERVER=lucebox` for the native dflash_server or +# `MODEL_SERVER=llamacpp` for llama.cpp's server), MAX_CTX, BUDGET, +# MODEL_ID, EXTRA_SERVER_ARGS, PORT, etc. +# +# Output: +# $LOG_DIR/lucebench-{area,sweep}.{json,md} — bench results (per-case rows +# + markdown summary) +# $LOG_DIR/lucebench.out — stdout/stderr from the run +# $LOG_DIR/server.log — server stdout/stderr +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +: "${MAX_CTX:=32768}" +: "${BUDGET:=22}" +: "${VERIFY_MODE:=ddtree}" +: "${EXTRA_SERVER_ARGS:=--lazy-draft}" +: "${LUCEBENCH_AREA:=}" +: "${LUCEBENCH_THINK:=}" +: "${LUCEBENCH_MAX_TOKENS:=}" +: "${LUCEBENCH_TIMEOUT:=300}" +: "${LUCEBENCH_PARALLEL:=1}" +source "$SCRIPT_DIR/common.sh" + +CLIENT_OUT="$LOG_DIR/lucebench.out" + +# Build the luce-bench argv. With no LUCEBENCH_AREA, we run the level1 set +# (smoke + code + gsm8k + agent + longctx — the standard capability gate +# documented in luce-bench/src/lucebench/levels.py), and write per-area +# JSONs + `_summary.{json,md}` under $LOG_DIR/lucebench-sweep/. +# With LUCEBENCH_AREA=X (single area), we write a single JSON to +# $LOG_DIR/lucebench-X.json so the file name carries the area. +# With LUCEBENCH_AREA= or `all`, we sweep into lucebench-sweep/. +# `--areas` is the canonical flag since luce-bench v0.2.5; the older +# `--sweep` is still accepted but emits a deprecation note. +lucebench_args=(--base-url "$BASE_URL" --model "$MODEL_ID" \ + --timeout "$LUCEBENCH_TIMEOUT" --parallel "$LUCEBENCH_PARALLEL") + +# Default area set when LUCEBENCH_AREA is unset/empty: the level1 capability +# gate (mirrors luce-bench's `--level level1`). 
Picking `all` here was too +# broad — it tripped slow areas (ds4-eval, forge, agent_recorded) on every +# default run. +: "${LUCEBENCH_AREA_DEFAULT:=smoke,code,gsm8k,agent,longctx}" +effective_area="${LUCEBENCH_AREA:-$LUCEBENCH_AREA_DEFAULT}" + +if [[ "$effective_area" == *","* || "$effective_area" == "all" ]]; then + # Multi-area or `all`: sweep, write per-area JSONs + a roll-up. + lucebench_args+=(--areas "$effective_area" --out-dir "$LOG_DIR" --name lucebench-sweep) +else + # Single area: one JSON named after the area for convenient diffing. + lucebench_args+=(--areas "$effective_area" \ + --json-out "$LOG_DIR/lucebench-$effective_area.json") +fi + +# --think / --no-think only applies when explicitly set. Leaving the flag +# off lets the server's card-defined defaults govern (recommended for +# capability gates; explicit modes are for A/B sweeps). +if [[ "$LUCEBENCH_THINK" == "1" ]]; then + lucebench_args+=(--think) +elif [[ "$LUCEBENCH_THINK" == "0" ]]; then + lucebench_args+=(--no-think) +fi + +if [[ -n "$LUCEBENCH_MAX_TOKENS" ]]; then + lucebench_args+=(--max-tokens "$LUCEBENCH_MAX_TOKENS") +fi + +start_lucebox_server +trap stop_lucebox_server EXIT +wait_lucebox_server + +set +e +cd "$REPO_DIR" +# Delegate to harness.bench (the Python entry point) so this wrapper, the +# `lucebox profile` framework, and ad-hoc operators all go through the +# same argv-building source of truth. +uv run python -m harness.bench "${lucebench_args[@]}" \ + > "$CLIENT_OUT" 2>&1 +RC=$? +set -e + +finish_report "$CLIENT_OUT" "$RC" +exit "$RC" diff --git a/harness/pyproject.toml b/harness/pyproject.toml new file mode 100644 index 00000000..ca1d2e67 --- /dev/null +++ b/harness/pyproject.toml @@ -0,0 +1,33 @@ +[project] +name = "harness" +version = "0.1.0" +description = "Client launchers, server-profile sweeps, and bench orchestration for Lucebox." 
+readme = "README.md" +license = { text = "Apache-2.0" } +requires-python = ">=3.10" +authors = [{ name = "Lucebox" }] + +# luce-bench is consumed lazily — `harness.bench.run` invokes +# `python -m lucebench.cli` as a subprocess, so the package doesn't need +# to be importable at install time. Kept out of the hard dep list so the +# workspace can lock without luce-bench in the registry (lands in a +# sibling PR — see #337). Install with `uv pip install luce-bench` on the +# host running benches. +dependencies = [] + +[project.optional-dependencies] +# Kept as an empty alias for backward compatibility — luce-bench[forge] +# is itself a back-compat no-op as of luce-bench v0.2.6. +forge = [] +dev = ["pytest>=8.0"] + +[project.scripts] +harness-run-bench = "harness.bench:main" +harness-claude-code = "harness.clients.claude_code:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/harness"] diff --git a/harness/src/harness/__init__.py b/harness/src/harness/__init__.py new file mode 100644 index 00000000..51d29d9f --- /dev/null +++ b/harness/src/harness/__init__.py @@ -0,0 +1,21 @@ +"""Lucebox harness — client launchers, bench orchestration, profile sweeps. + +The harness is the "run X against a Lucebox server" abstraction. It owns the +server-lifecycle + client-config patterns that the shell launchers under +``harness/clients/`` use, exposed here as importable Python so callers like +``lucebox profile`` can build on it without re-implementing argv. + +Modules: + - `harness.bench` — run a luce-bench area (or full sweep) against a server, + return the parsed JSON. The Python entry point for + ``harness/clients/run_lucebench.sh``. + - `harness.clients.claude_code` — launch Claude Code against a Lucebox + server with the right env (ANTHROPIC_BASE_URL, telemetry-off knobs, + etc.). 
The Python entry point for ``harness/clients/run_claude_code.sh`` + and for the host-side ``lucebox claude`` subcommand. + +All entry points keep the stdlib-only invariant — fresh test boxes can run +the harness before any project Python deps are installed. +""" + +__version__ = "0.1.0" diff --git a/harness/src/harness/bench.py b/harness/src/harness/bench.py new file mode 100644 index 00000000..06f0b6a3 --- /dev/null +++ b/harness/src/harness/bench.py @@ -0,0 +1,231 @@ +"""Run a luce-bench area (or full sweep) against a Lucebox server. + +The function form of ``harness/clients/run_lucebench.sh``. Same contract: +build a luce-bench argv with the per-area knobs, exec it against a running +server, parse the JSON snapshot back. Used by ``lucebox profile`` so the +StepDefinition framework doesn't have to re-derive argv. + +The shell wrapper still exists for operator use (``harness/clients/run_lucebench.sh``). +Both ultimately do the same thing — single source of truth for what +"run luce-bench against this server" means. + +Stdlib-only at runtime. luce-bench is invoked as a subprocess so we don't +have to import it (its CLI module owns argv parsing + dispatch). +""" + +from __future__ import annotations + +import json +import shutil +import subprocess +import sys +from pathlib import Path +from typing import Any, Literal + +Area = Literal["ds4-eval", "code", "longctx", "agent", "forge"] + + +def run_bench( + *, + base_url: str, + area: Area | None = None, + areas: str | None = None, + model: str = "default", + think: bool | None = None, + max_tokens: int | None = None, + timeout: int = 300, + parallel: int = 1, + auth_env: str | None = None, + out_dir: Path | None = None, + name: str | None = None, + json_out: Path | None = None, + extra_body: dict[str, Any] | None = None, +) -> dict[str, Any]: + """Run a luce-bench area (or the full sweep) and return the parsed result. + + Args: + base_url: Lucebox server's HTTP base, e.g. ``http://localhost:8080``. 
+ area: Single area name, or ``None`` for sweep mode. + areas: Optional explicit selector forwarded as luce-bench's + ``--areas`` value (a single name, comma list, or ``all``). + Use this when invoking from a wrapper that has already + resolved a custom area set (e.g. the level1 default + ``smoke,code,gsm8k,agent,longctx``); leave it None to let + ``area``'s sweep-mode (``area=None`` → ``--areas all``) + drive selection. + model: Model ID. ``"default"`` triggers luce-bench's ``/v1/models`` + auto-resolve (uses the single exposed model if there's exactly one). + think: ``True`` → ``--think``, ``False`` → ``--no-think``, ``None`` → + omit the flag and let the server's card defaults govern. + max_tokens: Per-request decode cap. ``None`` → use luce-bench area default. + timeout: Per-case wall timeout (seconds). + parallel: In-flight concurrency. + auth_env: Env var name to read Authorization bearer from (e.g. + ``OPENROUTER_API_KEY``). + out_dir: Directory for sweep output. Required when ``area`` is None. + name: Name for the sweep dir. Required when ``area`` is None. + json_out: Single-area mode only — override the output JSON path. + Used by ``lucebox profile`` to land snapshots where its + framework expects them (``dest/bench-.json``). Ignored + in sweep mode (sweep always writes per-area files + summary + under ``out_dir/name/``). + extra_body: Additional fields to merge into every chat-completion + request body. Use for provider-specific knobs. + + Returns: + For single-area: the parsed area JSON (rows, pass count, timings). + For sweep: the parsed ``_summary.json`` (cross-area aggregate). + """ + if area is None and (out_dir is None or name is None): + raise ValueError("sweep mode (area=None) requires out_dir and name") + if area is not None and areas is not None: + raise ValueError("pass either area=... 
or areas=..., not both") + + argv: list[str] = [ + sys.executable, + "-m", + "lucebench.cli", + "--base-url", + base_url, + "--model", + model, + "--timeout", + str(timeout), + "--parallel", + str(parallel), + ] + + resolved_json_out: Path + if area is not None: + if json_out is not None: + resolved_json_out = json_out + else: + resolved_json_out = (out_dir or Path.cwd()) / f"lucebench-{area}.json" + resolved_json_out.parent.mkdir(parents=True, exist_ok=True) + # --areas (canonical in v0.2.5+) accepts a single name too, so we + # use it everywhere instead of the back-compat --area form. + argv += ["--areas", area, "--json-out", str(resolved_json_out)] + else: + assert out_dir is not None and name is not None # narrowed by check above + out_dir.mkdir(parents=True, exist_ok=True) + # `--areas all` is the v0.2.5+ replacement for `--sweep`. Same + # output shape: per-area JSONs + _summary.{json,md} under + # out_dir/name/. Pre-v0.2.5 luce-bench still accepts --sweep + # with a deprecation warning, but new callers use --areas. + # Honor an explicit ``areas=`` selector when the caller has one + # (e.g. the shell wrapper forwarding LUCEBENCH_AREA's default + # level1 set); fall back to `all` for the default sweep mode. + selector = areas if areas is not None else "all" + argv += ["--areas", selector, "--out-dir", str(out_dir), "--name", name] + resolved_json_out = out_dir / name / "_summary.json" + + if think is True: + argv += ["--think"] + elif think is False: + argv += ["--no-think"] + if max_tokens is not None: + argv += ["--max-tokens", str(max_tokens)] + if auth_env is not None: + argv += ["--auth-env", auth_env] + if extra_body is not None: + argv += ["--extra-body", json.dumps(extra_body)] + + subprocess.run(argv, check=True) + return json.loads(resolved_json_out.read_text()) + + +def main() -> int: + """Thin CLI wrapping ``run_bench`` for the ``harness-run-bench`` console script. 
+ + Most operator invocations go through ``harness/clients/run_lucebench.sh`` + (which handles the server lifecycle too). This entry exists so the + function form has a working CLI surface for ad-hoc use. + """ + import argparse + + parser = argparse.ArgumentParser(prog="harness-run-bench") + parser.add_argument("--base-url", required=True) + # Two forms accepted for compatibility with the shell wrapper: + # --area single-area mode (one of the choices below) + # --areas delegate verbatim to luce-bench + # The shell wrapper (harness/clients/run_lucebench.sh) emits + # `--areas`, so callers downstream of it must accept that form + # without an "unrecognized argument" error. + parser.add_argument("--area", default=None, + choices=["ds4-eval", "code", "longctx", "agent", "forge"]) + parser.add_argument("--areas", default=None, + help="Area selector: a single name, a comma list " + "(e.g. `code,gsm8k`), or `all`. Sweep mode kicks in " + "whenever this contains a comma or equals `all`.") + parser.add_argument("--model", default="default") + grp = parser.add_mutually_exclusive_group() + grp.add_argument("--think", action="store_true") + grp.add_argument("--no-think", action="store_true") + parser.add_argument("--max-tokens", type=int, default=None) + parser.add_argument("--timeout", type=int, default=300) + parser.add_argument("--parallel", type=int, default=1) + parser.add_argument("--auth-env", default=None) + parser.add_argument("--out-dir", type=Path, default=Path.cwd()) + parser.add_argument("--name", default="harness-run") + parser.add_argument("--json-out", type=Path, default=None, + help="single-area only — explicit output JSON path") + args = parser.parse_args() + + think: bool | None = None + if args.think: + think = True + elif args.no_think: + think = False + + # Caller can be on a fresh test box; check luce-bench is reachable. 
+ if shutil.which(sys.executable) is None: + print(f"[harness] missing python: {sys.executable}", file=sys.stderr) + return 2 + + # Resolve --area / --areas → the (area, areas_arg) pair that + # ``run_bench`` understands. Sweep mode is a comma-list, "all", or + # a single area we route through the luce-bench `--areas` flag. + area: Area | None = args.area + areas_arg: str | None = args.areas + if areas_arg is not None and area is not None: + print("[harness] pass either --area or --areas, not both", + file=sys.stderr) + return 2 + if areas_arg is not None: + if "," in areas_arg or areas_arg == "all": + # Sweep mode: signal via area=None and pass the literal + # selector through to luce-bench via ``extra_body`` is the + # wrong path — we plumb a dedicated kwarg below. + area = None + else: + # Single name passed via --areas: treat as --area for + # function-form parity. Validate against the allowed set. + valid = {"ds4-eval", "code", "longctx", "agent", "forge", + "smoke", "gsm8k", "agent_recorded"} + if areas_arg not in valid: + print(f"[harness] unknown area {areas_arg!r}; " + f"known: {sorted(valid)}", file=sys.stderr) + return 2 + area = areas_arg # type: ignore[assignment] + areas_arg = None + + result = run_bench( + base_url=args.base_url, + area=area, + areas=areas_arg, + model=args.model, + think=think, + max_tokens=args.max_tokens, + timeout=args.timeout, + parallel=args.parallel, + auth_env=args.auth_env, + out_dir=args.out_dir, + name=args.name, + json_out=args.json_out, + ) + print(json.dumps(result, indent=2)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/harness/src/harness/clients/__init__.py b/harness/src/harness/clients/__init__.py new file mode 100644 index 00000000..5f57afb1 --- /dev/null +++ b/harness/src/harness/clients/__init__.py @@ -0,0 +1,17 @@ +"""Client launchers — start a Lucebox server, point a real client at it. 
+ +Each module here exposes a `launch()` function that handles the +client-specific env config + binary exec, alongside the shell wrappers +under ``harness/clients/run_*.sh`` that handle the server lifecycle. + +The split: shell wrappers own the server start/stop + log-dir setup; +these Python modules own the client-side env + argv. ``lucebox `` +subcommands (e.g. ``lucebox claude``) call these directly. +""" + +# Re-export submodules so callers (and mypy) can resolve +# ``from harness.clients import claude_code`` without the submodule +# needing to be force-loaded by an earlier import. +from . import claude_code, codex, hermes, openclaw, opencode, pi + +__all__ = ["claude_code", "codex", "hermes", "openclaw", "opencode", "pi"] diff --git a/harness/src/harness/clients/_common.py b/harness/src/harness/clients/_common.py new file mode 100644 index 00000000..1746412a --- /dev/null +++ b/harness/src/harness/clients/_common.py @@ -0,0 +1,86 @@ +"""Shared helpers for harness client launchers. + +Each ``harness.clients.`` module exposes a ``launch()`` function with +the same shape (base_url, model, api_key, prompt, interactive, …). The +patterns below capture the bits that repeat: binary resolution, work-dir +setup, exec convention. +""" + +from __future__ import annotations + +import os +import shutil +import subprocess +import tempfile +from pathlib import Path + +DEFAULT_API_KEY = "sk-lucebox" +DEFAULT_MODEL_ID = "luce-dflash" + + +def find_bin(name: str, *, env_var: str, work_dir_hint: str | None = None) -> str: + """Locate a client binary. + + Search order: + 1. ``$`` (explicit override) + 2. ``$PATH`` + 3. ``$CLIENT_WORK_DIR/`` (test-box convention) + + Raises FileNotFoundError with a clear install hint otherwise. 
+ """ + explicit = os.environ.get(env_var) + if explicit and _is_executable_file(Path(explicit)): + return explicit + on_path = shutil.which(name) + if on_path: + return on_path + work_dir = os.environ.get("CLIENT_WORK_DIR") + if work_dir and work_dir_hint: + candidate = Path(work_dir) / work_dir_hint + if _is_executable_file(candidate): + return str(candidate) + raise FileNotFoundError( + f"{name!r} binary not found. Install it or set ${env_var} to its path." + ) + + +def _is_executable_file(p: Path) -> bool: + """True iff ``p`` is a regular file (or symlink to one) that's +x. + + Used by find_bin so an env-var override pointing at a directory, a + non-executable wrapper, or a stale path doesn't get returned to the + launcher only to fail at exec time with a worse error. + """ + return p.is_file() and os.access(p, os.X_OK) + + +def mktempdir(prefix: str) -> Path: + """Make a working directory for client config/state. Returns Path.""" + return Path(tempfile.mkdtemp(prefix=f"lucebox-{prefix}-")) + + +def exec_client( + argv: list[str], + env: dict[str, str], + *, + interactive: bool, + timeout: int | None = None, +) -> int: + """Run a client binary with env, return its exit code. + + Interactive: inherits stdio (TUI works), no timeout. + Non-interactive: stdin from /dev/null, optional wall-time timeout via + ``subprocess.run(..., timeout=N)`` — no dependency on the external + ``timeout`` binary, which isn't guaranteed across base images. On + timeout we return 124 to match the conventional GNU ``timeout`` exit + code, so harness scripts that branch on $? see the same value either + way. 
+ """ + if interactive: + return subprocess.run(argv, env=env).returncode + try: + return subprocess.run( + argv, env=env, stdin=subprocess.DEVNULL, timeout=timeout + ).returncode + except subprocess.TimeoutExpired: + return 124 diff --git a/harness/src/harness/clients/claude_code.py b/harness/src/harness/clients/claude_code.py new file mode 100644 index 00000000..8775dd83 --- /dev/null +++ b/harness/src/harness/clients/claude_code.py @@ -0,0 +1,194 @@ +"""Launch Claude Code pointed at a Lucebox server. + +The env contract is the same one ``harness/clients/run_claude_code.sh`` uses: + + ANTHROPIC_BASE_URL → Lucebox /v1 base (Anthropic-Messages compat) + ANTHROPIC_API_KEY → any token; Lucebox doesn't gate + CLAUDE_CODE_API_BASE_URL → some Claude Code versions read this instead + CLAUDE_CODE_DISABLE_* → telemetry + nonessential traffic off + CLAUDE_CODE_DISABLE_NONSTREAMING_FALLBACK → prevent the client from + falling back to a non-streaming code path + that older Lucebox builds don't speak + +Two invocation modes: + - **interactive** (default): exec claude with an empty argv, user gets the + full TUI. The ``lucebox claude`` host subcommand calls this. + - **print** (test mode): ``--print --output-format json`` for the harness + ``run_claude_code.sh`` compatibility-check flow. + +Stdlib only. +""" + +from __future__ import annotations + +import os +import shutil +import subprocess +import sys +from pathlib import Path + +DEFAULT_API_KEY = "sk-lucebox" # Lucebox doesn't auth; placeholder satisfies clients + + +def claude_env( + base_url: str, + *, + api_key: str = DEFAULT_API_KEY, + extra_env: dict[str, str] | None = None, +) -> dict[str, str]: + """Compose the env dict that points Claude Code at a Lucebox server. + + Returns a fresh dict to merge over os.environ — callers control whether + to inherit, sanitize, or replace the parent environment. 
+ """ + env: dict[str, str] = { + "ANTHROPIC_API_KEY": api_key, + "ANTHROPIC_BASE_URL": base_url.rstrip("/"), + "CLAUDE_CODE_API_BASE_URL": base_url.rstrip("/"), + # Older Claude Code versions occasionally retry a non-streaming + # request when the streaming endpoint returns an unexpected shape. + # That path isn't well-tested against Lucebox; force-disable so + # any incompatibility surfaces in the streaming path where we test. + "CLAUDE_CODE_DISABLE_NONSTREAMING_FALLBACK": "1", + # Privacy/telemetry off — both for the test harness (deterministic + # runs) and for user-facing `lucebox claude` (running a local model). + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + "CLAUDE_CODE_DISABLE_TELEMETRY": "1", + } + if extra_env: + env.update(extra_env) + return env + + +def find_claude_bin() -> str: + """Locate the `claude` binary. + + Search order: + 1. $CLAUDE_BIN env var (explicit override) + 2. $PATH (typical dev install) + 3. Test-box convention: $CLIENT_WORK_DIR/clients/claude_code/npm/bin/claude + + Raises FileNotFoundError if none of the above resolve. + """ + explicit = os.environ.get("CLAUDE_BIN") + if explicit and Path(explicit).exists(): + return explicit + on_path = shutil.which("claude") + if on_path: + return on_path + work_dir = os.environ.get("CLIENT_WORK_DIR") + if work_dir: + candidate = Path(work_dir) / "clients" / "claude_code" / "npm" / "bin" / "claude" + if candidate.exists(): + return str(candidate) + raise FileNotFoundError( + "claude binary not found. Install Claude Code or set $CLAUDE_BIN to its path." + ) + + +def launch( + *, + base_url: str, + model: str = "luce-dflash", + api_key: str = DEFAULT_API_KEY, + prompt: str | None = None, + timeout: int | None = None, + extra_args: list[str] | None = None, + interactive: bool = True, +) -> int: + """Run Claude Code against the given Lucebox server. + + Args: + base_url: Lucebox HTTP base, e.g. ``http://localhost:8080``. + model: Model ID to advertise to Claude Code. 
def main() -> int:
    """Entry point for the ``harness-claude-code`` console script.

    Runs Claude Code interactively when ``--prompt`` is omitted and in
    one-shot mode when it is given. Arguments the parser does not
    recognize are handed to ``launch`` as ``extra_args``.

    Returns:
        The exit code from ``launch``, or 127 when ``launch`` raises
        ``FileNotFoundError``.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser(prog="harness-claude-code")
    cli.add_argument(
        "--base-url",
        required=True,
        help="Lucebox server, e.g. http://localhost:8080",
    )
    cli.add_argument("--model", default="luce-dflash")
    cli.add_argument("--api-key", default=DEFAULT_API_KEY)
    cli.add_argument(
        "--prompt",
        default=None,
        help="One-shot prompt (non-interactive). Omit for TUI.",
    )
    cli.add_argument("--timeout", type=int, default=None)
    opts, passthrough = cli.parse_known_args()

    launch_kwargs = {
        "base_url": opts.base_url,
        "model": opts.model,
        "api_key": opts.api_key,
        "prompt": opts.prompt,
        "timeout": opts.timeout,
        "extra_args": passthrough or None,
        # No prompt means the user wants the TUI.
        "interactive": opts.prompt is None,
    }
    try:
        status = launch(**launch_kwargs)
    except FileNotFoundError as missing:
        print(f"[harness-claude-code] {missing}", file=sys.stderr)
        return 127
    return status
def write_config(home: Path, *, base_url: str, model: str, sandbox: str,
                 wire_api: str) -> None:
    """Write ``config.toml`` into ``home``, registering Lucebox as a provider.

    Args:
        home: Directory that receives ``config.toml``.
        base_url: Lucebox HTTP base; trailing slashes are stripped and
            ``/v1`` is appended.
        model: Model ID written as the top-level ``model``.
        sandbox: Value for ``sandbox_mode``.
        wire_api: Value for the provider's ``wire_api``.

    Every interpolated value is emitted as an escaped TOML basic string, so
    a quote, backslash, or newline in an argument cannot break the file or
    inject extra keys. The file is written as UTF-8.
    """
    import json

    def toml_str(value: str) -> str:
        # JSON string escapes (\" \\ \n \t \uXXXX ...) are all valid TOML
        # basic-string escapes. ensure_ascii=False avoids surrogate-pair
        # escapes, which TOML rejects; DEL is the one control character
        # json leaves raw, so escape it by hand.
        return json.dumps(value, ensure_ascii=False).replace("\x7f", "\\u007f")

    endpoint = f"{base_url.rstrip('/')}/v1"
    config_path = home / "config.toml"
    config_path.write_text(
        f"""model = {toml_str(model)}
model_provider = "luce"
approval_policy = "never"
sandbox_mode = {toml_str(sandbox)}

[model_providers.luce]
name = "Lucebox"
base_url = {toml_str(endpoint)}
env_key = "OPENAI_API_KEY"
wire_api = {toml_str(wire_api)}
""",
        encoding="utf-8",
    )
def main() -> int:
    """Console entry point for ``harness-codex``.

    Runs interactively unless ``--prompt`` is supplied. Arguments the
    parser does not recognize are passed to ``launch`` as ``extra_args``.

    Returns:
        The exit code from ``launch``, or 127 when ``launch`` raises
        ``FileNotFoundError``.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser(prog="harness-codex")
    cli.add_argument("--base-url", required=True)
    cli.add_argument("--model", default=DEFAULT_MODEL_ID)
    cli.add_argument("--api-key", default=DEFAULT_API_KEY)
    cli.add_argument("--prompt", default=None)
    cli.add_argument("--timeout", type=int, default=None)
    cli.add_argument("--sandbox", default="danger-full-access")
    cli.add_argument(
        "--wire-api",
        default="responses",
        choices=["responses", "chat"],
    )
    opts, passthrough = cli.parse_known_args()

    launch_kwargs = {
        "base_url": opts.base_url,
        "model": opts.model,
        "api_key": opts.api_key,
        "prompt": opts.prompt,
        "timeout": opts.timeout,
        "interactive": opts.prompt is None,
        "sandbox": opts.sandbox,
        "wire_api": opts.wire_api,
        "extra_args": passthrough or None,
    }
    try:
        status = launch(**launch_kwargs)
    except FileNotFoundError as missing:
        print(f"[harness-codex] {missing}", file=sys.stderr)
        return 127
    return status
+""" + +from __future__ import annotations + +import os +import sys +from pathlib import Path + +from harness.clients._common import ( + DEFAULT_API_KEY, + DEFAULT_MODEL_ID, + exec_client, + find_bin, + mktempdir, +) + + +def _yaml_quote(s: str) -> str: + """Render ``s`` as a YAML double-quoted scalar with JSON-style escapes. + + YAML's double-quoted style accepts the JSON escape set; ``json.dumps`` + produces exactly that, so it's the cheapest correct way to embed an + arbitrary string (paths containing quotes, backslashes, or even + newlines) into the generated config without a YAML library. + """ + import json as _json + return _json.dumps(s) + + +def write_config(home: Path, *, base_url: str, model: str, api_key: str, + max_ctx: int, max_tokens: int, repo_dir: str) -> None: + base = f"{base_url.rstrip('/')}/v1" + q = _yaml_quote + (home / "config.yaml").write_text( + f"""model: + default: {q(model)} + provider: "lucebox" + base_url: {q(base)} + api_key: {q(api_key)} + api_mode: "chat_completions" + context_length: {max_ctx} + max_tokens: {max_tokens} + +custom_providers: + - name: "lucebox" + base_url: {q(base)} + api_key: {q(api_key)} + api_mode: "chat_completions" + models: + {q(model)}: + context_length: {max_ctx} + max_tokens: {max_tokens} + +terminal: + backend: "local" + cwd: {q(repo_dir)} + timeout: 180 + lifetime_seconds: 300 +""" + ) + (home / ".env").write_text( + f"""OPENAI_API_KEY={api_key} +OPENAI_BASE_URL={base} +HERMES_INFERENCE_PROVIDER=lucebox +HERMES_INFERENCE_MODEL={model} +HERMES_ACCEPT_HOOKS=1 +HERMES_API_TIMEOUT=600 +HERMES_API_CALL_STALE_TIMEOUT=600 +""" + ) + + +def launch( + *, + base_url: str, + model: str = DEFAULT_MODEL_ID, + api_key: str = DEFAULT_API_KEY, + prompt: str | None = None, + timeout: int | None = None, + interactive: bool = True, + work_dir: Path | None = None, + max_ctx: int = 98304, + max_tokens: int = 4096, + max_turns: int = 40, + extra_args: list[str] | None = None, +) -> int: + bin_path = find_bin("hermes", 
env_var="HERMES_BIN", + work_dir_hint="clients/hermes/home/.local/bin/hermes") + home = work_dir or mktempdir("hermes") + # Normalize REPO_DIR to an absolute path so Hermes's `terminal.cwd` + # resolves consistently regardless of the cwd at launch. A relative + # path in REPO_DIR would be re-interpreted against the in-container + # cwd (or wherever Hermes's parser anchors it), which has bitten + # users running the harness from a sibling directory. + repo_dir = str(Path(os.environ.get("REPO_DIR", str(Path.cwd()))).resolve()) + write_config(home, base_url=base_url, model=model, api_key=api_key, + max_ctx=max_ctx, max_tokens=max_tokens, repo_dir=repo_dir) + + base = f"{base_url.rstrip('/')}/v1" + # Mirror harness/clients/run_hermes.sh: HERMES_HOME tells the binary + # which config dir to read (Hermes does not always honor HOME alone); + # the OPENAI_/HERMES_INFERENCE_* env vars are the canonical wiring; + # NO_COLOR keeps the batch log diffable. + env = { + **os.environ, + "HOME": str(home), + "HERMES_HOME": str(home), + "OPENAI_API_KEY": api_key, + "OPENAI_BASE_URL": base, + "HERMES_INFERENCE_PROVIDER": "lucebox", + "HERMES_INFERENCE_MODEL": model, + "HERMES_ACCEPT_HOOKS": "1", + "NO_COLOR": "1", + } + argv: list[str] = [bin_path] + if interactive: + if extra_args: + argv += extra_args + else: + if prompt is None: + raise ValueError("non-interactive mode requires prompt=...") + # Mirror run_hermes.sh's validated batch invocation: `chat` subcommand + # with the lucebox provider, accept-hooks/yolo so it doesn't stop on + # interactive prompts, `--query` for the user prompt (not positional). 
def main() -> int:
    """Console entry point for ``harness-hermes``.

    Runs interactively unless ``--prompt`` is supplied. Arguments the
    parser does not recognize are passed to ``launch`` as ``extra_args``.

    Returns:
        The exit code from ``launch``, or 127 when ``launch`` raises
        ``FileNotFoundError``.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser(prog="harness-hermes")
    cli.add_argument("--base-url", required=True)
    cli.add_argument("--model", default=DEFAULT_MODEL_ID)
    cli.add_argument("--api-key", default=DEFAULT_API_KEY)
    cli.add_argument("--prompt", default=None)
    # Default to the same 420s wall-clock limit the shell harness
    # (run_hermes.sh) uses, so both entry points give up on a hung agent
    # loop after roughly the same time.
    cli.add_argument("--timeout", type=int, default=420)
    cli.add_argument("--max-ctx", type=int, default=98304)
    cli.add_argument("--max-tokens", type=int, default=4096)
    cli.add_argument(
        "--max-turns",
        type=int,
        default=40,
        help="Max agent turns for `hermes chat --max-turns` "
             "(mirrors HERMES_MAX_TURNS in run_hermes.sh).",
    )
    opts, passthrough = cli.parse_known_args()

    launch_kwargs = {
        "base_url": opts.base_url,
        "model": opts.model,
        "api_key": opts.api_key,
        "prompt": opts.prompt,
        "timeout": opts.timeout,
        "interactive": opts.prompt is None,
        "max_ctx": opts.max_ctx,
        "max_tokens": opts.max_tokens,
        "max_turns": opts.max_turns,
        "extra_args": passthrough or None,
    }
    try:
        status = launch(**launch_kwargs)
    except FileNotFoundError as missing:
        print(f"[harness-hermes] {missing}", file=sys.stderr)
        return 127
    return status
def write_config(home: Path, *, base_url: str, model: str, api_key: str,
                 api: str = "openai-completions",
                 max_ctx: int = 204800, max_tokens: int = 4096) -> Path:
    """Write the OpenClaw config patch and return its path.

    The patch registers a ``lucebox`` provider (merge mode) with a single
    model entry and makes that provider/model the default.

    Args:
        home: Directory that receives ``openclaw.patch.json``.
        base_url: Lucebox HTTP base; trailing slashes are stripped and
            ``/v1`` is appended.
        model: Model ID used for the model entry and ``defaultModel``.
        api_key: Value stored as the provider's ``apiKey``.
        api: API flavor recorded on both the provider and the model.
        max_ctx: Context window recorded on both provider and model.
        max_tokens: Output limit recorded on both provider and model.

    Returns:
        Path of the written JSON file.
    """
    model_entry = {
        "id": model,
        "name": "Lucebox DFlash",
        "api": api,
        "contextWindow": max_ctx,
        "maxTokens": max_tokens,
        "input": ["text"],
        "output": ["text"],
        "supportsTools": True,
    }
    provider = {
        "baseUrl": f"{base_url.rstrip('/')}/v1",
        "apiKey": api_key,
        "auth": "api-key",
        "api": api,
        "contextWindow": max_ctx,
        "maxTokens": max_tokens,
        "models": [model_entry],
    }
    document = {
        "models": {
            "mode": "merge",
            "providers": {"lucebox": provider},
            "defaultProvider": "lucebox",
            "defaultModel": model,
        }
    }

    destination = home / "openclaw.patch.json"
    destination.write_text(json.dumps(document, indent=2))
    return destination
+ "OPENCLAW_CONFIG_PATCH": str(patch_path), + } + # Apply the JSON patch via OpenClaw's `config patch` subcommand — same + # step run_openclaw.sh performs before invoking `agent`. Without this, + # the agent run can't see the lucebox provider entry. + # + # Cap the patch step at 30s (or `timeout`, whichever is shorter): a + # hung preflight has bricked CI runs in the past because the launcher + # waited indefinitely. 30s is plenty for what is fundamentally a JSON + # merge against a local file; if it takes longer, the binary is wedged + # and we want a clear timeout exit rather than a stuck process. + patch_timeout = 30 if timeout is None else min(30, timeout) + try: + subprocess.run( + [bin_path, "config", "patch", "--file", str(patch_path)], + env=env, check=True, stdin=subprocess.DEVNULL, + timeout=patch_timeout, + ) + except subprocess.TimeoutExpired: + print( + f"[harness-openclaw] `openclaw config patch` exceeded " + f"{patch_timeout}s — aborting before agent run.", + file=sys.stderr, + ) + return 124 + argv: list[str] = [bin_path] + if interactive: + if extra_args: + argv += extra_args + else: + if prompt is None: + raise ValueError("non-interactive mode requires prompt=...") + # Mirror run_openclaw.sh: `agent --local --json --model + # lucebox/ --session-id … --timeout … --message `. + # Selecting the explicit model+session keeps the run reproducible + # and stops OpenClaw from falling back to a default provider when + # the patch is partial. 
def main() -> int:
    """Console entry point for ``harness-openclaw``.

    Runs interactively unless ``--prompt`` is supplied. Arguments the
    parser does not recognize are passed to ``launch`` as ``extra_args``.

    Returns:
        The exit code from ``launch``; 127 when the openclaw binary cannot
        be found; the failing command's exit status (or 1 if it was killed
        by a signal) when a ``check=True`` step inside ``launch`` fails.
    """
    import argparse

    parser = argparse.ArgumentParser(prog="harness-openclaw")
    parser.add_argument("--base-url", required=True)
    parser.add_argument("--model", default=DEFAULT_MODEL_ID)
    parser.add_argument("--api-key", default=DEFAULT_API_KEY)
    parser.add_argument("--prompt", default=None)
    parser.add_argument("--timeout", type=int, default=None)
    parser.add_argument("--max-ctx", type=int, default=204800)
    parser.add_argument("--max-tokens", type=int, default=4096)
    parser.add_argument("--agent-timeout", type=int, default=300,
                        help="Inner `openclaw agent --timeout` value "
                             "(mirrors the literal `--timeout 300` in run_openclaw.sh).")
    args, extra = parser.parse_known_args()
    try:
        return launch(
            base_url=args.base_url, model=args.model, api_key=args.api_key,
            prompt=args.prompt, timeout=args.timeout,
            interactive=args.prompt is None,
            max_ctx=args.max_ctx, max_tokens=args.max_tokens,
            agent_timeout=args.agent_timeout,
            extra_args=extra or None,
        )
    except FileNotFoundError as e:
        print(f"[harness-openclaw] {e}", file=sys.stderr)
        return 127
    except subprocess.CalledProcessError as e:
        # launch() runs the `config patch` preflight with check=True, so a
        # non-zero exit there raises instead of returning. Report it as a
        # normal failure rather than letting a traceback escape the CLI.
        print(
            f"[harness-openclaw] preflight command failed with exit status "
            f"{e.returncode}: {e.cmd}",
            file=sys.stderr,
        )
        # A negative returncode means "killed by signal"; that is not a
        # valid process exit status, so map it to a generic failure.
        return e.returncode if e.returncode > 0 else 1
def write_config(
    project_dir: Path,
    *,
    base_url: str,
    model: str,
    api_key: str,
    max_ctx: int = 32768,
    max_tokens: int = 4096,
    overwrite: bool = False,
) -> None:
    """Write ``opencode.json`` into ``project_dir``.

    The file registers a ``lucebox`` provider backed by the
    OpenAI-compatible AI SDK package, selects ``lucebox/<model>`` as both
    the main and the small model, and switches the ``write`` and ``bash``
    tools off so a harness run cannot modify the project tree.

    Args:
        project_dir: Directory that receives ``opencode.json``.
        base_url: Lucebox HTTP base; trailing slashes are stripped and
            ``/v1`` is appended.
        model: Model ID registered under the provider.
        api_key: Value stored in the provider options.
        max_ctx: Context limit recorded for the model.
        max_tokens: Output limit recorded for the model.
        overwrite: Allow replacing an existing ``opencode.json``.

    Raises:
        FileExistsError: if ``opencode.json`` already exists and
            ``overwrite`` is false.
    """
    target = project_dir / "opencode.json"
    if not overwrite and target.exists():
        raise FileExistsError(
            f"refusing to overwrite existing {target}; "
            "remove it or pass --project-dir to a fresh directory."
        )

    qualified_model = f"lucebox/{model}"
    provider = {
        "npm": "@ai-sdk/openai-compatible",
        "name": "Lucebox",
        "options": {
            "baseURL": f"{base_url.rstrip('/')}/v1",
            "apiKey": api_key,
            "timeout": 600000,
            "chunkTimeout": 60000,
        },
        "models": {
            model: {
                "name": "Lucebox DFlash",
                "limit": {"context": max_ctx, "output": max_tokens},
            }
        },
    }
    # Destructive tools stay off, matching run_opencode.sh.
    tool_policy = {"write": False, "bash": False}

    document = {
        "$schema": "https://opencode.ai/config.json",
        "model": qualified_model,
        "small_model": qualified_model,
        "provider": {"lucebox": provider},
        "tools": tool_policy,
    }
    target.write_text(json.dumps(document, indent=2))
def main() -> int:
    """Console entry point for ``harness-opencode``.

    Runs interactively unless ``--prompt`` is supplied. Arguments the
    parser does not recognize are passed to ``launch`` as ``extra_args``.

    ``--project-dir`` selects the directory ``opencode.json`` is written to
    and opencode is run from. The "refusing to overwrite" error raised by
    ``write_config`` tells the user to pass this flag, so it has to exist
    here; without it the flag was silently forwarded to opencode.

    Returns:
        The exit code from ``launch``; 127 when the opencode binary cannot
        be found; 2 when an existing ``opencode.json`` would be clobbered.
    """
    import argparse

    parser = argparse.ArgumentParser(prog="harness-opencode")
    parser.add_argument("--base-url", required=True)
    parser.add_argument("--model", default=DEFAULT_MODEL_ID)
    parser.add_argument("--api-key", default=DEFAULT_API_KEY)
    parser.add_argument("--prompt", default=None)
    parser.add_argument("--timeout", type=int, default=None)
    parser.add_argument("--max-ctx", type=int, default=32768)
    parser.add_argument("--max-tokens", type=int, default=4096)
    parser.add_argument("--project-dir", type=Path, default=None,
                        help="Directory to write opencode.json into and run "
                             "opencode from. Must not already contain an "
                             "opencode.json.")
    args, extra = parser.parse_known_args()

    try:
        return launch(
            base_url=args.base_url,
            model=args.model,
            api_key=args.api_key,
            prompt=args.prompt,
            timeout=args.timeout,
            interactive=args.prompt is None,
            project_dir=args.project_dir,
            max_ctx=args.max_ctx,
            max_tokens=args.max_tokens,
            extra_args=extra or None,
        )
    except FileNotFoundError as e:
        print(f"[harness-opencode] {e}", file=sys.stderr)
        return 127
    except FileExistsError as e:
        print(f"[harness-opencode] {e}", file=sys.stderr)
        return 2
def write_config(home: Path, *, base_url: str, model: str, api_key: str,
                 api: str = "openai-responses",
                 tools: str = "read,grep,find,ls") -> None:
    """Create Pi's config tree under ``home``.

    Creates ``home/agent`` and ``home/sessions``, then writes
    ``agent/settings.json`` (compaction disabled) and ``agent/models.json``
    (a ``lucebox`` provider with one model, selected as the default).

    Args:
        home: Root directory for the generated tree.
        base_url: Lucebox HTTP base; trailing slashes are stripped and
            ``/v1`` is appended.
        model: Model ID registered under the provider.
        api_key: Value stored as the provider's ``apiKey``.
        api: API flavor recorded on the provider.
        tools: Accepted for signature compatibility; not written to any
            file by this function.
    """
    agent_dir = home / "agent"
    for directory in (agent_dir, home / "sessions"):
        directory.mkdir(parents=True, exist_ok=True)

    settings = {"compaction": {"enabled": False}}
    (agent_dir / "settings.json").write_text(json.dumps(settings))

    provider = {
        "baseUrl": f"{base_url.rstrip('/')}/v1",
        "api": api,
        "apiKey": api_key,
        "compat": {
            "supportsDeveloperRole": False,
            "supportsReasoningEffort": False,
            "supportsUsageInStreaming": True,
            "maxTokensField": "max_tokens",
        },
        "models": [
            {"id": model, "name": "Lucebox DFlash"},
        ],
    }
    registry = {
        "providers": {"lucebox": provider},
        "defaultModel": {"provider": "lucebox", "id": model},
    }
    (agent_dir / "models.json").write_text(json.dumps(registry))
def main() -> int:
    """Console entry point for ``harness-pi``.

    Runs interactively unless ``--prompt`` is supplied. Arguments the
    parser does not recognize are passed to ``launch`` as ``extra_args``.

    Returns:
        The exit code from ``launch``, or 127 when ``launch`` raises
        ``FileNotFoundError``.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser(prog="harness-pi")
    cli.add_argument("--base-url", required=True)
    cli.add_argument("--model", default=DEFAULT_MODEL_ID)
    cli.add_argument("--api-key", default=DEFAULT_API_KEY)
    cli.add_argument("--prompt", default=None)
    cli.add_argument("--timeout", type=int, default=None)
    cli.add_argument(
        "--tools",
        default="read,grep,find,ls",
        help="Comma-separated tool allowlist passed to "
             "`pi --tools` (matches PI_TOOLS in run_pi.sh).",
    )
    opts, passthrough = cli.parse_known_args()

    launch_kwargs = {
        "base_url": opts.base_url,
        "model": opts.model,
        "api_key": opts.api_key,
        "prompt": opts.prompt,
        "timeout": opts.timeout,
        "interactive": opts.prompt is None,
        "tools": opts.tools,
        "extra_args": passthrough or None,
    }
    try:
        status = launch(**launch_kwargs)
    except FileNotFoundError as missing:
        print(f"[harness-pi] {missing}", file=sys.stderr)
        return 127
    return status
b/install.sh @@ -0,0 +1,138 @@ +#!/usr/bin/env bash +# install.sh — Bootstrap installer for the lucebox host wrapper. +# +# Canonical install (Luce-Org main, stable channel): +# +# curl -fsSL https://raw.githubusercontent.com/Luce-Org/lucebox-hub/main/install.sh | bash +# +# Install from a different fork / branch (dev channel). Note the env var +# is on the `bash` side of the pipe — `VAR=val curl … | bash` would attach +# it to the `curl` process, leaving `bash` with the canonical default: +# +# curl -fsSL https://raw.githubusercontent.com/easel/lucebox-hub/feat/lucebox-docker/install.sh | \ +# LUCEBOX_INSTALL_URL=https://raw.githubusercontent.com/easel/lucebox-hub/feat/lucebox-docker/lucebox.sh bash +# +# The installer bakes the source URL into the installed `lucebox.sh` as +# `LUCEBOX_INSTALLED_FROM=...`, so `lucebox update` later re-pulls from the +# same channel without the user having to remember which fork they used. +# +# Override the install destination via $LUCEBOX_INSTALL_DEST (default +# $HOME/.local/bin/lucebox). This is what `lucebox update` uses to replace +# the file in place. + +set -euo pipefail + +LUCEBOX_INSTALL_URL="${LUCEBOX_INSTALL_URL:-https://raw.githubusercontent.com/Luce-Org/lucebox-hub/main/lucebox.sh}" +DEST="${LUCEBOX_INSTALL_DEST:-$HOME/.local/bin/lucebox}" + +# ── helpers ─────────────────────────────────────────────────────────────── +C_OK=$'\033[1;32m' ; C_ERR=$'\033[1;31m' ; C_DIM=$'\033[2m' ; C_RST=$'\033[0m' +if [ ! 
-t 1 ] || [ "${NO_COLOR:-}" ]; then + C_OK="" ; C_ERR="" ; C_DIM="" ; C_RST="" +fi +info() { printf '%s[install]%s %s\n' "$C_DIM" "$C_RST" "$*"; } +ok() { printf '%s[install] ✓%s %s\n' "$C_OK" "$C_RST" "$*"; } +die() { printf '%s[install] ✗%s %s\n' "$C_ERR" "$C_RST" "$*" >&2; exit 1; } + +command -v curl >/dev/null 2>&1 || die "curl is required (apt-get install curl)" + +# ── fetch ───────────────────────────────────────────────────────────────── +tmp=$(mktemp -t lucebox.XXXXXX) || die "couldn't create temp file" +# shellcheck disable=SC2064 # we want $tmp expanded now, not at trap time +trap "rm -f '$tmp' '$tmp.bak'" EXIT +info "fetching $LUCEBOX_INSTALL_URL" +curl -fsSL "$LUCEBOX_INSTALL_URL" -o "$tmp" \ + || die "download failed from $LUCEBOX_INSTALL_URL" + +# ── sanity check ────────────────────────────────────────────────────────── +# Refuse to install something that isn't recognizably lucebox.sh. Catches +# 404 pages, redirects to HTML, and accidental URL typos. +head -1 "$tmp" | grep -q '^#!/usr/bin/env bash$' \ + || die "downloaded file does not look like a bash script (got: $(head -1 "$tmp"))" +grep -q '^VERSION=' "$tmp" \ + || die "downloaded file is missing VERSION marker — not lucebox.sh?" + +# ── decide what gets baked in as the persisted channel ─────────────────── +# `lucebox update` reads LUCEBOX_INSTALLED_FROM from the installed copy and +# re-fetches from it. Persisting a SHA-pinned URL is a footgun — every +# future update would re-install the same frozen SHA forever, defeating +# the point of `update`. So: +# +# 1. If $LUCEBOX_INSTALL_CHANNEL is set, that's the persisted URL +# (caller takes responsibility for picking a real branch URL). +# 2. Else if LUCEBOX_INSTALL_URL has a 40-char hex SHA segment, refuse +# to persist it — tell the user to set LUCEBOX_INSTALL_CHANNEL. +# Common case: someone curl'd from /raw// to bypass a stale CDN +# cache during dev; they meant for updates to track the branch. +# 3. 
Else persist LUCEBOX_INSTALL_URL as-is (branch or canonical main). +channel_url="${LUCEBOX_INSTALL_CHANNEL:-}" +if [ -z "$channel_url" ]; then + # Match a full 40-char hex SHA in the URL path, not the broader + # {7,40} range — a 7-39 char hex segment is more likely a branch + # name shaped like a short SHA (e.g. `feat/abc1234-hotfix`) than an + # actual SHA-pin. Keeping the gate at exactly 40 chars matches what + # `git rev-parse HEAD` emits and what `/raw//` URLs from + # GitHub's CDN actually carry. + if [[ "$LUCEBOX_INSTALL_URL" =~ /[0-9a-fA-F]{40}/[^/]+\.sh$ ]]; then + die "$(cat </install.sh | \\ + LUCEBOX_INSTALL_URL=/lucebox.sh \\ + LUCEBOX_INSTALL_CHANNEL=https://raw.githubusercontent.com////lucebox.sh \\ + bash +EOM +)" + fi + channel_url="$LUCEBOX_INSTALL_URL" +fi + +# Bake the channel URL into the file. Use a `|` delimiter since URLs +# contain `/`. The line is expected to exist in lucebox.sh with a `:-` +# default; we rewrite the whole assignment. +# +# The URL ends up inside a bash double-quoted literal in the installed +# script, so any of $ ` " \ in `channel_url` would break the installed +# file (or worse, allow command substitution to run at next sourcing). +# Validate that the URL is plain http(s)+ASCII-URL-safe characters; we +# don't expect arbitrary content here, only an upstream raw.github URL +# (or a forked equivalent). Escape the sed metachars (\&|) separately so +# the substitution itself round-trips. 
+case "$channel_url" in + *['"$`\']*) die "channel URL contains unsafe characters: $channel_url" ;; +esac +escaped_url=$(printf '%s' "$channel_url" | sed 's/[\\&|]/\\&/g') +sed "s|^LUCEBOX_INSTALLED_FROM=.*|LUCEBOX_INSTALLED_FROM=\"$escaped_url\"|" "$tmp" > "$tmp.baked" +mv "$tmp.baked" "$tmp" +grep -q "^LUCEBOX_INSTALLED_FROM=\"$escaped_url\"$" "$tmp" \ + || die "failed to bake install source into the downloaded script" + +# ── install ─────────────────────────────────────────────────────────────── +mkdir -p "$(dirname "$DEST")" +chmod +x "$tmp" +mv "$tmp" "$DEST" +trap - EXIT +ok "installed lucebox → $DEST" +info " fetched from: $LUCEBOX_INSTALL_URL" +info " update channel: $channel_url" +if [ "$LUCEBOX_INSTALL_URL" != "$channel_url" ]; then + info " (lucebox update will track the channel URL, not the fetch URL)" +fi + +# ── PATH hint ───────────────────────────────────────────────────────────── +case ":${PATH:-}:" in + *":$(dirname "$DEST"):"*) ;; + *) info " hint: add $(dirname "$DEST") to PATH so 'lucebox' is on the path" ;; +esac + +cat </dev/null || realpath "$0" 2>/dev/null || echo "$0")" +SCRIPT_NAME="$(basename "$SCRIPT_PATH")" + +# ── tunables / env overrides ─────────────────────────────────────────────── +# Host-side scalars (image registry+variant, port, container name, models +# dir). Resolution order, applied uniformly via _lucebox_resolve below: +# 1. $LUCEBOX_ per-invocation env override +# 2. config.toml
. persisted user choice (system of record)
+#   3. derived / canonical default
+# This keeps the wrapper and the in-container Python CLI agreeing on
+# effective values — config.toml is the single source of truth, both
+# sides read it.
+UNIT_NAME="lucebox.service"
+UNIT_PATH="${XDG_CONFIG_HOME:-$HOME/.config}/systemd/user/$UNIT_NAME"
+
+# CUDA driver floor for the prebuilt CUDA 12 image.
+# shellcheck disable=SC2034
+MIN_DRIVER_CUDA12=525
+
+# Canonical source of `lucebox.sh`. The bootstrap installer (`install.sh`)
+# rewrites this line at install time to record which URL the user actually
+# installed from — `lucebox update` then re-pulls from the same channel
+# without losing track of forks. Falls back to the Luce-Org main branch
+# when nothing was baked in (e.g. someone curl'd the script directly).
+LUCEBOX_INSTALLED_FROM="${LUCEBOX_INSTALLED_FROM:-https://raw.githubusercontent.com/Luce-Org/lucebox-hub/main/lucebox.sh}"
+
+# Print the location of the persisted config.toml (no trailing newline).
+# The directory is $LUCEBOX_HOME when that is set and non-empty, otherwise
+# $HOME/.lucebox — the same rule as lucebox.config.default_config_path.
+# This wrapper only reads the file; the Python CLI is the writer.
+_lucebox_config_path() {
+    local config_dir="${LUCEBOX_HOME:-}"
+    [ -n "$config_dir" ] || config_dir="$HOME/.lucebox"
+    printf '%s/config.toml' "$config_dir"
+}
+
+# Read a `
.` value from config.toml. Returns empty if the
+# file is missing, the section/key is absent, or the value is empty.
+# Handles the subset of TOML that lucebox writes:
+#   [section]
+#   key = "string"   # surrounding double-quotes are stripped
+#   key = 8080       # bare scalars passed through verbatim
+#   key = true       # same
+# Inline `# comment` is honored after a bare scalar, after the closing
+# quote of a string, and after a `[section]` header. A `#` inside a quoted
+# string is data, not a comment. Escape sequences inside strings are passed
+# through verbatim (not decoded). Arrays / inline tables / multi-line
+# strings aren't written by the Python persister, so we don't parse them.
+#
+# $1: dotted key; everything before the last dot is the section name. An
+#     argument without a dot addresses a key that precedes any section.
+# Prints the first matching value followed by a newline; prints nothing
+# when there is no match.
+_lucebox_config_get() {
+    local dotted="$1" cfg
+    cfg="$(_lucebox_config_path)"
+    [ -f "$cfg" ] || return 0
+    local section="${dotted%.*}"
+    local key="${dotted##*.}"
+    # No dot in the argument: the key lives outside any [section].
+    [ "$section" = "$dotted" ] && section=""
+    awk -v want_section="$section" -v want_key="$key" '
+        BEGIN { current = "" }
+        # Table header: remember which section the following keys belong to.
+        /^[[:space:]]*\[/ {
+            t = $0
+            sub(/^[[:space:]]*\[[[:space:]]*/, "", t)
+            sub(/[[:space:]]*\][[:space:]]*(#.*)?$/, "", t)
+            current = t
+            next
+        }
+        /^[[:space:]]*#/ { next }
+        /=/ {
+            if (current != want_section) next
+            eq = index($0, "=")
+            k = substr($0, 1, eq - 1)
+            # A "#" left of the first "=" puts that "=" inside a comment.
+            if (index(k, "#") > 0) next
+            gsub(/^[[:space:]]+|[[:space:]]+$/, "", k)
+            if (k != want_key) next
+            v = substr($0, eq + 1)
+            sub(/^[[:space:]]+/, "", v)
+            endq = 0
+            if (substr(v, 1, 1) == "\"") {
+                # Locate the closing quote, stepping over backslash escapes,
+                # so that "#" characters inside the string are preserved.
+                n = length(v)
+                for (i = 2; i <= n; i++) {
+                    c = substr(v, i, 1)
+                    if (c == "\\") { i++; continue }
+                    if (c == "\"") { endq = i; break }
+                }
+            }
+            if (endq > 0) {
+                v = substr(v, 2, endq - 2)
+            } else {
+                # Bare scalar (or unterminated string): drop the inline
+                # comment and trailing blanks, pass the rest through.
+                sub(/#.*$/, "", v)
+                sub(/[[:space:]]+$/, "", v)
+            }
+            print v
+            exit
+        }
+    ' "$cfg"
+}
+
+# Resolve a scalar through the precedence ladder env > config.toml > default.
+#   $1 env_value  value of the per-invocation override; callers typically
+#                 pass `"${LUCEBOX_FOO:-}"` (the `:-` matters under `set -u`)
+#   $2 toml_key   dotted key handed to _lucebox_config_get
+#   $3 default    printed when neither of the above yields a non-empty value
+# Prints the winning value without a trailing newline.
+_lucebox_resolve() {
+    local env_value="$1" toml_key="$2" default="$3" v
+    if [ -n "$env_value" ]; then
+        printf '%s' "$env_value"
+        return
+    fi
+    v="$(_lucebox_config_get "$toml_key")"
+    if [ -n "$v" ]; then
+        printf '%s' "$v"
+        return
+    fi
+    printf '%s' "$default"
+}
+
+# Derive the default image URL from the install source so a fork install
+# (e.g. easel/lucebox-hub) gets the fork's GHCR image automatically when
+# config.toml hasn't pinned one yet.
Pattern: +# https://raw.githubusercontent.com////lucebox.sh +# → ghcr.io// +# GHCR rejects mixed-case org paths so the org segment is lowercased; the +# repo name is preserved as-is. Falls back to the canonical Luce-Org image +# when the URL doesn't match the raw.githubusercontent.com pattern. +_lucebox_derive_image() { + # The ref segment can contain slashes (e.g. `feat/lucebox-docker`), so + # the middle `.+` greedily eats everything up to the trailing + # `/lucebox.sh`. The first two `[^/]+` capture org + repo, which are + # never slash-containing on GitHub. + local url="$1" org repo + if [[ "$url" =~ ^https?://raw\.githubusercontent\.com/([^/]+)/([^/]+)/.+/lucebox\.sh$ ]]; then + org=$(printf '%s' "${BASH_REMATCH[1]}" | tr '[:upper:]' '[:lower:]') + repo="${BASH_REMATCH[2]}" + printf 'ghcr.io/%s/%s' "$org" "$repo" + return + fi + printf 'ghcr.io/luce-org/lucebox-hub' +} + +# Effective scalars, env > config.toml > default. +CONTAINER_NAME=$(_lucebox_resolve "${LUCEBOX_CONTAINER:-}" runtime.container_name "lucebox") +DEFAULT_PORT=$(_lucebox_resolve "${LUCEBOX_PORT:-}" runtime.port "8080") +DEFAULT_MODELS_DIR=$(_lucebox_resolve "${LUCEBOX_MODELS:-}" paths.models "${XDG_DATA_HOME:-$HOME/.local/share}/lucebox/models") +IMAGE_BASE=$(_lucebox_resolve "${LUCEBOX_IMAGE:-}" image.registry "$(_lucebox_derive_image "$LUCEBOX_INSTALLED_FROM")") + +# ── LUCEBOX_HOST_* safe defaults (belt-and-suspenders) ──────────────────── +# `set -u` makes any unbound LUCEBOX_HOST_* read fatal. Historically this has +# been the #1 source of regressions in this wrapper: someone adds a code path +# that touches a LUCEBOX_HOST_* var before probe_host has run, the call sites +# that DO pre-probe still work, and the bug ships. To make the bug literally +# unrepresentable we seed every LUCEBOX_HOST_* with an explicit safe default +# at script-load time (these mirror probe_host's "nothing detected" state). +# probe_host then overwrites them with real values. 
Any future read — pre- or
+# post-probe — is now well-defined.
+#
+# `: "${VAR:=default}"` assigns the default only when VAR is unset or empty,
+# so a value already present in the environment is left untouched here.
+# None of these lines exports anything; probe_host does the exporting.
+: "${LUCEBOX_HOST_NPROC:=1}"
+: "${LUCEBOX_HOST_RAM_GB:=0}"
+: "${LUCEBOX_HOST_GPU_VENDOR:=none}"
+: "${LUCEBOX_HOST_GPU_NAME:=}"
+: "${LUCEBOX_HOST_GPU_COUNT:=0}"
+: "${LUCEBOX_HOST_VRAM_GB:=0}"
+: "${LUCEBOX_HOST_GPU_SM:=}"
+: "${LUCEBOX_HOST_DRIVER_VERSION:=}"
+: "${LUCEBOX_HOST_DRIVER_MAJOR:=0}"
+: "${LUCEBOX_HOST_HAS_SYSTEMD:=0}"
+: "${LUCEBOX_HOST_IS_WSL:=0}"
+: "${LUCEBOX_HOST_HAS_DOCKER:=0}"
+: "${LUCEBOX_HOST_DOCKER_VERSION:=}"
+: "${LUCEBOX_HOST_HAS_CTK:=none}"
+# Host-identity facts (item 1 — host-identity capture). These ride along
+# the existing LUCEBOX_HOST_* convoy into the container so /opt/lucebox-hub/
+# HOST_INFO can be written without re-probing inside the container (where
+# /proc and nvidia-smi see the container's view, not the rig's).
+: "${LUCEBOX_HOST_OS_PRETTY:=}"
+: "${LUCEBOX_HOST_KERNEL:=}"
+: "${LUCEBOX_HOST_WSL_VERSION:=}"
+: "${LUCEBOX_HOST_NVIDIA_CTK_VERSION:=}"
+: "${LUCEBOX_HOST_CPU_MODEL:=}"
+: "${LUCEBOX_HOST_GPU_LIST_CSV:=}"
+: "${LUCEBOX_HOST_CUDA_VISIBLE_DEVICES:=}"
+# Tracks whether probe_host has actually run; pieces of the code that need
+# fresh host facts (e.g. cmd_check, cmd_serve) gate on this. Default 0.
+: "${_LUCEBOX_HOST_PROBED:=0}" + +# ── output helpers ──────────────────────────────────────────────────────── +if [ -t 1 ] && [ -z "${NO_COLOR:-}" ]; then + C_INFO='\033[1;34m'; C_OK='\033[1;32m'; C_WARN='\033[1;33m' + C_ERR='\033[1;31m'; C_DIM='\033[2m'; C_RST='\033[0m' +else + C_INFO=''; C_OK=''; C_WARN=''; C_ERR=''; C_DIM=''; C_RST='' +fi + +info() { printf '%b[INFO]%b %s\n' "$C_INFO" "$C_RST" "$*"; } +ok() { printf '%b[OK]%b %s\n' "$C_OK" "$C_RST" "$*"; } +warn() { printf '%b[WARN]%b %s\n' "$C_WARN" "$C_RST" "$*"; } +err() { printf '%b[ERROR]%b %s\n' "$C_ERR" "$C_RST" "$*" >&2; } +hint() { printf ' %b%s%b\n' "$C_DIM" "$*" "$C_RST"; } +die() { err "$*"; exit 1; } + +# ── host probing ────────────────────────────────────────────────────────── +# Sets the LUCEBOX_HOST_* variables consumed by the in-container Python CLI +# (passed through with -e). The Python side trusts these and doesn't reprobe +# — it can't see the host's /proc anyway, only the container's. + +probe_host() { + LUCEBOX_HOST_NPROC=$(nproc 2>/dev/null || echo 1) + # RAM: try Linux /proc/meminfo first, then macOS/BSD sysctl, else 0. 
+ LUCEBOX_HOST_RAM_GB=0 + if [ -r /proc/meminfo ]; then + LUCEBOX_HOST_RAM_GB=$(awk '/MemTotal/{printf "%.0f", $2/1024/1024}' /proc/meminfo 2>/dev/null || echo 0) + elif command -v sysctl &>/dev/null; then + mem_bytes=$(sysctl -n hw.memsize 2>/dev/null || echo 0) + LUCEBOX_HOST_RAM_GB=$(( mem_bytes / 1024 / 1024 / 1024 )) + fi + LUCEBOX_HOST_GPU_VENDOR="none" + LUCEBOX_HOST_GPU_NAME="" + LUCEBOX_HOST_GPU_COUNT=0 + LUCEBOX_HOST_VRAM_GB=0 + LUCEBOX_HOST_GPU_SM="" + LUCEBOX_HOST_DRIVER_VERSION="" + LUCEBOX_HOST_DRIVER_MAJOR=0 + + if command -v nvidia-smi &>/dev/null; then + local q + if q=$(nvidia-smi --query-gpu=name,memory.total,driver_version,compute_cap \ + --format=csv,noheader,nounits 2>/dev/null) && [ -n "$q" ]; then + LUCEBOX_HOST_GPU_VENDOR="nvidia" + LUCEBOX_HOST_GPU_NAME=$(printf '%s\n' "$q" | head -1 | awk -F', ' '{print $1}') + local mem_mib + mem_mib=$(printf '%s\n' "$q" | head -1 | awk -F', ' '{print $2}') + LUCEBOX_HOST_VRAM_GB=$((mem_mib / 1024)) + LUCEBOX_HOST_DRIVER_VERSION=$(printf '%s\n' "$q" | head -1 | awk -F', ' '{print $3}') + LUCEBOX_HOST_DRIVER_MAJOR=${LUCEBOX_HOST_DRIVER_VERSION%%.*} + local cc + cc=$(printf '%s\n' "$q" | head -1 | awk -F', ' '{print $4}') + LUCEBOX_HOST_GPU_SM="${cc//./}" + LUCEBOX_HOST_GPU_COUNT=$(printf '%s\n' "$q" | wc -l) + fi + # Multi-GPU enumeration for /props.host. The single-GPU vars + # above (GPU_NAME / GPU_SM / VRAM_GB / DRIVER_VERSION) keep + # describing GPU 0 for back-compat with cmd_check + autotune; + # the full per-GPU CSV rides along separately so HOST_INFO can + # emit the whole array. + LUCEBOX_HOST_GPU_LIST_CSV=$(nvidia-smi \ + --query-gpu=index,uuid,pci.bus_id,name,compute_cap,memory.total,power.limit \ + --format=csv,noheader 2>/dev/null || echo "") + fi + # CUDA_VISIBLE_DEVICES from the caller's env (empty default = "all GPUs"). + LUCEBOX_HOST_CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-}" + + # OS / kernel identity. /etc/os-release is the freedesktop spec for + # "what distro is this?" 
and we keep PRETTY_NAME verbatim (it already + # includes the version, e.g. "Ubuntu 22.04.3 LTS"). + LUCEBOX_HOST_OS_PRETTY="" + if [ -r /etc/os-release ]; then + # shellcheck source=/dev/null + LUCEBOX_HOST_OS_PRETTY=$(. /etc/os-release 2>/dev/null && printf '%s' "${PRETTY_NAME:-}") + fi + LUCEBOX_HOST_KERNEL=$(uname -r 2>/dev/null || echo "") + + # WSL version detection. "wsl2" matches the kernel-side string the + # MS-shipped WSL2 kernel embeds; "wsl1" is what the legacy translation + # layer writes. Anything else stays empty (= not WSL). + LUCEBOX_HOST_WSL_VERSION="" + if [ -r /proc/version ]; then + if grep -q "microsoft-standard-WSL2" /proc/version 2>/dev/null; then + LUCEBOX_HOST_WSL_VERSION="wsl2" + elif grep -qi "Microsoft" /proc/version 2>/dev/null; then + LUCEBOX_HOST_WSL_VERSION="wsl1" + fi + fi + + # CPU model — first "model name" hit in /proc/cpuinfo. Cheaper than + # lscpu and keeps the bash side dep-free. + LUCEBOX_HOST_CPU_MODEL="" + if [ -r /proc/cpuinfo ]; then + LUCEBOX_HOST_CPU_MODEL=$(awk -F': ' '/^model name/{print $2; exit}' /proc/cpuinfo 2>/dev/null || echo "") + fi + + LUCEBOX_HOST_HAS_SYSTEMD=0 + if command -v systemctl &>/dev/null && systemctl --user show-environment &>/dev/null; then + LUCEBOX_HOST_HAS_SYSTEMD=1 + fi + + LUCEBOX_HOST_IS_WSL=0 + if grep -qi microsoft /proc/version 2>/dev/null \ + || [ -e /proc/sys/fs/binfmt_misc/WSLInterop ]; then + LUCEBOX_HOST_IS_WSL=1 + fi + + LUCEBOX_HOST_HAS_DOCKER=0 + LUCEBOX_HOST_DOCKER_VERSION="" + if command -v docker &>/dev/null && docker ps &>/dev/null; then + LUCEBOX_HOST_HAS_DOCKER=1 + LUCEBOX_HOST_DOCKER_VERSION=$(timeout 5 docker version --format '{{.Server.Version}}' 2>/dev/null || echo "") + fi + + LUCEBOX_HOST_HAS_CTK="none" + if [ "$LUCEBOX_HOST_HAS_DOCKER" = "1" ]; then + if command -v nvidia-container-runtime &>/dev/null; then + LUCEBOX_HOST_HAS_CTK="runtime" + elif command -v nvidia-ctk &>/dev/null \ + && nvidia-ctk cdi list 2>/dev/null | grep -q 'nvidia.com/gpu'; then + 
LUCEBOX_HOST_HAS_CTK="cdi" + elif command -v nvidia-ctk &>/dev/null; then + LUCEBOX_HOST_HAS_CTK="installed-unwired" + fi + fi + + # NVIDIA Container Toolkit version (best-effort; empty when nvidia-ctk + # is not installed). nvidia-ctk --version prints "NVIDIA Container + # Toolkit CLI version 1.16.2" on a single line — extract the trailing + # token so the host-info JSON carries just the version, not the banner. + LUCEBOX_HOST_NVIDIA_CTK_VERSION="" + if command -v nvidia-ctk &>/dev/null; then + LUCEBOX_HOST_NVIDIA_CTK_VERSION=$(nvidia-ctk --version 2>/dev/null \ + | awk '/version/{print $NF; exit}' \ + || echo "") + fi + + export LUCEBOX_HOST_NPROC LUCEBOX_HOST_RAM_GB LUCEBOX_HOST_GPU_VENDOR + export LUCEBOX_HOST_GPU_NAME LUCEBOX_HOST_GPU_COUNT LUCEBOX_HOST_VRAM_GB + export LUCEBOX_HOST_GPU_SM LUCEBOX_HOST_DRIVER_VERSION LUCEBOX_HOST_DRIVER_MAJOR + export LUCEBOX_HOST_HAS_SYSTEMD LUCEBOX_HOST_IS_WSL + export LUCEBOX_HOST_HAS_DOCKER LUCEBOX_HOST_DOCKER_VERSION + export LUCEBOX_HOST_HAS_CTK + export LUCEBOX_HOST_OS_PRETTY LUCEBOX_HOST_KERNEL LUCEBOX_HOST_WSL_VERSION + export LUCEBOX_HOST_NVIDIA_CTK_VERSION LUCEBOX_HOST_CPU_MODEL + export LUCEBOX_HOST_GPU_LIST_CSV LUCEBOX_HOST_CUDA_VISIBLE_DEVICES + _LUCEBOX_HOST_PROBED=1 +} + +# Cheap idempotency wrapper. Anything that needs real host facts (vs the safe +# defaults seeded at script-load) calls this. Subcommands that go straight to +# `systemctl`/`journalctl` no longer need to remember to call probe_host. +ensure_probed() { + [ "$_LUCEBOX_HOST_PROBED" = "1" ] || probe_host +} + +pick_variant() { + # CUDA 12.8 is the supported image variant for this branch. Effective + # value goes through the same env > config.toml > default ladder as + # everything else so `config set image.variant=...` propagates. + _lucebox_resolve "${LUCEBOX_VARIANT:-}" image.variant "cuda12" +} + +# ── prereq checks (host-only) ───────────────────────────────────────────── +# Print-and-exit on anything that needs root to install. 
The Python CLI does
+# the richer reporting; this is the bare minimum to make `docker run` viable.
+
+# Verify that docker and the NVIDIA driver are installed and responding.
+# Reports every failed check (not just the first) and then exits 1.
+require_host_prereqs() {
+    local missing=0
+    if ! command -v docker &>/dev/null; then
+        err "docker is not installed"
+        hint "Install: https://docs.docker.com/engine/install/"
+        missing=1
+    elif ! docker ps &>/dev/null; then
+        err "docker daemon not reachable"
+        hint "sudo systemctl start docker (or: add your user to the 'docker' group, then re-login)"
+        missing=1
+    fi
+
+    if ! command -v nvidia-smi &>/dev/null; then
+        err "nvidia-smi not found — no NVIDIA driver detected"
+        hint "Install the NVIDIA driver: https://www.nvidia.com/Download/index.aspx"
+        missing=1
+    elif ! nvidia-smi --query-gpu=name --format=csv,noheader &>/dev/null; then
+        err "nvidia-smi present but NVML calls fail — likely a driver/library mismatch"
+        hint "Reboot, or reinstall the matching NVIDIA driver package"
+        missing=1
+    fi
+
+    [ "$missing" = "0" ] || exit 1
+}
+
+# Exit 1 with remediation hints unless LUCEBOX_HOST_HAS_CTK says the NVIDIA
+# Container Toolkit is wired into docker ("runtime" or "cdi").
+require_ctk() {
+    case "$LUCEBOX_HOST_HAS_CTK" in
+        runtime|cdi) return 0 ;;
+        installed-unwired)
+            err "NVIDIA Container Toolkit installed but not wired into docker"
+            hint "sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker"
+            hint " or generate a CDI spec: sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml"
+            exit 1 ;;
+        none|*)
+            err "NVIDIA Container Toolkit not installed"
+            hint "Install: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html"
+            hint "Then register with docker:"
+            hint " sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker"
+            exit 1 ;;
+    esac
+}
+
+# Exit 1 unless a user systemd instance is available.
+# $1: short description of the action, used in the error message.
+require_systemd() {
+    # Earlier versions of this wrapper had `start`/`stop`/`logs`/etc. drop
+    # straight into cmd_systemctl_passthrough without probing first, which
+    # tripped `set -u` on the reference below. Two layers of defence now:
+    #   1) top-of-script seeds LUCEBOX_HOST_HAS_SYSTEMD=0 unconditionally, so
+    #      no read can be unbound even if probe_host is bypassed entirely.
+    #   2) ensure_probed runs probe_host on first call so we still get the
+    #      real answer for the require_systemd error path.
+    ensure_probed
+    if [ "$LUCEBOX_HOST_HAS_SYSTEMD" != "1" ]; then
+        err "user systemd is not available — required for $1"
+        hint "On WSL: set 'systemd=true' under [boot] in /etc/wsl.conf, then 'wsl --shutdown'."
+        hint "Otherwise: install systemd, or run '$SCRIPT_NAME serve' to run in the foreground without systemd."
+        exit 1
+    fi
+}
+
+# ── docker run construction ───────────────────────────────────────────────
+# All the Python-CLI subcommands share the same docker run incantation:
+# mount the host docker socket (so the in-container CLI can spawn server /
+# bench containers on the host daemon), mount $HOME at the same path (so
+# paths look identical in and out), and pass host facts via env. When an
+# NVIDIA GPU is detected we also pass --gpus all so the orchestrator can
+# call nvidia-smi during profile snapshot export; without it nvidia_smi_csv (and
+# any downstream power/utilization fields) come back empty.
+
+DOCKER_SOCK_PATH="${DOCKER_HOST:-/var/run/docker.sock}"
+DOCKER_SOCK_PATH="${DOCKER_SOCK_PATH#unix://}"
+
+# Print the `docker run` argv for an orchestrator (Python CLI) container,
+# one argument per line, for the caller to read back with `mapfile -t`.
+#   $1     image variant (tag)
+#   $2...  arguments handed to the in-container `lucebox` entrypoint
+# Side effect: creates $DEFAULT_MODELS_DIR if it does not exist.
+#
+# Because the transport is line-based, no single argument may contain a
+# newline. Host facts whose value spans several lines (the per-GPU CSV has
+# one row per GPU) are therefore forwarded by name only: `docker run -e VAR`
+# copies the value from the docker client's own environment, and every
+# variable listed by `compgen -e` is exported, so the process that finally
+# execs this argv from the same shell session carries the value.
+build_orchestrator_argv() {
+    local variant="$1"; shift
+    local tty=()
+    if [ -t 0 ] && [ -t 1 ]; then
+        tty=(-it)
+    else
+        tty=(-i)
+    fi
+    local argv=(docker run --rm "${tty[@]}")
+    if [ "${LUCEBOX_HOST_GPU_VENDOR:-none}" = "nvidia" ]; then
+        argv+=(--gpus all)
+    fi
+    argv+=(--name "${CONTAINER_NAME}-cli-$$")
+    argv+=(--user "$(id -u):$(id -g)")
+    # Only bind-mount the docker socket when DOCKER_HOST actually points
+    # at a unix socket on this host. With DOCKER_HOST=tcp://… or ssh://…
+    # the path we'd construct is `tcp` or empty, and `docker run -v` would
+    # bark with an "invalid mount" error before the orchestrator even
+    # starts. The orchestrator-in-container relies on docker access only
+    # when actually needed; pulling that mount when the host talks to
+    # docker over TCP/SSH is fine.
+    if [ -S "$DOCKER_SOCK_PATH" ]; then
+        argv+=(--group-add "$(stat -c '%g' "$DOCKER_SOCK_PATH")")
+        argv+=(-v "$DOCKER_SOCK_PATH:/var/run/docker.sock")
+    fi
+    argv+=(-v "$HOME:$HOME")
+    # Bind-mount the XDG models dir explicitly (host = container path) so
+    # paths line up in/out. The $HOME mount above already covers it when
+    # XDG_DATA_HOME is unset, but an explicit -v is required when the user
+    # points XDG_DATA_HOME outside $HOME.
+    mkdir -p "$DEFAULT_MODELS_DIR"
+    argv+=(-v "$DEFAULT_MODELS_DIR:$DEFAULT_MODELS_DIR")
+    argv+=(-w "$PWD")
+    argv+=(-e "HOME=$HOME")
+    # Host facts — Python side reads these instead of reprobing.
+    local var
+    for var in $(compgen -e | grep '^LUCEBOX_HOST_' || true); do
+        if [[ "${!var}" == *$'\n'* ]]; then
+            # Multi-line value: would be split into several argv entries
+            # by the line-based transport. Pass by name and let docker
+            # inherit the value from the environment.
+            argv+=(-e "$var")
+        else
+            argv+=(-e "$var=${!var}")
+        fi
+    done
+    # User overrides for image/port/container name propagate too.
+    argv+=(-e "LUCEBOX_IMAGE=$IMAGE_BASE")
+    argv+=(-e "LUCEBOX_VARIANT=$variant")
+    argv+=(-e "LUCEBOX_PORT=$DEFAULT_PORT")
+    argv+=(-e "LUCEBOX_CONTAINER=$CONTAINER_NAME")
+    # Always export the resolved models dir so the in-container CLI sees
+    # the same path the wrapper mounts (don't gate on `LUCEBOX_MODELS` being
+    # set — the XDG default needs to flow through too).
+    argv+=(-e "LUCEBOX_MODELS=$DEFAULT_MODELS_DIR")
+    [ -n "${HF_TOKEN:-}" ] && argv+=(-e "HF_TOKEN=$HF_TOKEN")
+
+    argv+=("${IMAGE_BASE}:${variant}")
+    # `lucebox` is the entrypoint subcommand handled by server/scripts/entrypoint.sh
+    # — it execs `python -m lucebox` with whatever args we pass on.
+    argv+=(lucebox "$@")
+    printf '%s\n' "${argv[@]}"
+}
+
+# ── subcommand implementations ────────────────────────────────────────────
+
+cmd_serve() {
+    # Long-running foreground server. Also what systemd's ExecStart= calls.
+    #
+    # Two-stage so config.toml takes effect:
+    #   1.
Run an ephemeral orchestrator container that emits the canonical + # server docker-run argv from .lucebox/config.toml (one arg per + # line on stdout). + # 2. Exec that argv. + # + # If stage 1 fails (image not pulled yet, no config), fall back to a + # conservative docker run — the container's own VRAM-tiered autotune + # picks reasonable defaults from there. + require_host_prereqs + ensure_probed + require_ctk + local variant + variant=$(pick_variant) + + # Pre-flight: refuse to stomp on something that's already serving this + # slot. Three states to distinguish, because silently `docker rm -f`-ing + # whatever is there hides real bugs (e.g. the user forgot they had a + # systemd unit up, and we'd happily race two servers on the same port): + # + # 1. systemd unit active → refuse, redirect to `logs`/`stop` + # 2. container running (no systemd)→ refuse, redirect to `docker logs` + # 3. container present but stopped → orphan from a SIGKILLed previous + # run (docker run --rm only cleans up on clean exit). Remove it, + # but TELL the user — they need to know their last run died dirty. + # CRITICAL: when systemd invokes US as the unit's ExecStart, is-active + # returns true *because of us* — refusing here would deadlock the unit + # in a restart loop (and historically did — commit a30dbe5 shipped this + # bug). systemd sets $INVOCATION_ID in every service exec, so its + # presence is the unambiguous "I am running as the systemd ExecStart" + # signal. Skip the unit-active check in that case; the container-state + # check below still catches a stale container holding the slot. + if [ -z "${INVOCATION_ID:-}" ] \ + && systemctl --user is-active --quiet "$UNIT_NAME" 2>/dev/null; then + err "${UNIT_NAME} is already running under systemd." 
+ hint " $SCRIPT_NAME logs # follow the journal" + hint " $SCRIPT_NAME restart # bounce the service" + hint " $SCRIPT_NAME stop # stop the service" + exit 1 + fi + local container_state + container_state=$(docker inspect --format '{{.State.Status}}' "$CONTAINER_NAME" 2>/dev/null || echo absent) + case "$container_state" in + absent) + ;; + running|restarting) + err "Container '$CONTAINER_NAME' is already running (outside systemd)." + hint " docker logs -f $CONTAINER_NAME # follow output" + hint " $SCRIPT_NAME stop # stop it" + exit 1 + ;; + exited|created|paused|dead) + info "Removing stale '$CONTAINER_NAME' container (state=$container_state, likely from a previous unclean exit)" + docker rm -f "$CONTAINER_NAME" >/dev/null + ;; + *) + warn "Container '$CONTAINER_NAME' is in unexpected state '$container_state' — removing" + docker rm -f "$CONTAINER_NAME" >/dev/null + ;; + esac + + local orch_argv server_argv + mapfile -t orch_argv < <(build_orchestrator_argv "$variant" print-serve-argv) + + if mapfile -t server_argv < <("${orch_argv[@]}" 2>/dev/null) \ + && [ "${#server_argv[@]}" -gt 0 ] \ + && [ "${server_argv[0]}" = "docker" ]; then + info "Starting lucebox server (variant=$variant, from config.toml)" + _serve_and_track "${server_argv[@]}" + return $? + fi + + warn "Couldn't fetch server argv from container (image not pulled?) — using fallback" + info "Starting lucebox server (variant=$variant, port=$DEFAULT_PORT, defaults only)" + local fallback_models="$DEFAULT_MODELS_DIR" + mkdir -p "$fallback_models" + # Forward host facts even on the fallback path so the in-container + # entrypoint can still write /opt/lucebox-hub/HOST_INFO from the host's + # view of the rig. Matches the orchestrator path (see + # build_orchestrator_argv) — without it, HOST_INFO would be written + # with "source: unknown" any time print-serve-argv fails. 
+ local fallback_argv=(docker run --rm + --name "$CONTAINER_NAME" + --gpus all + -p "$DEFAULT_PORT:8080" + -v "$HOME:$HOME" + -v "$fallback_models:/opt/lucebox-hub/server/models") + local var + for var in $(compgen -e | grep '^LUCEBOX_HOST_' || true); do + fallback_argv+=(-e "$var=${!var}") + done + fallback_argv+=("${IMAGE_BASE}:${variant}") + _serve_and_track "${fallback_argv[@]}" +} + +# Foreground server runner with controlling-process lifetime semantics: +# the docker daemon owns containers independently of the CLI, so a bare +# `exec docker run` leaves the container alive after the wrapper's parent +# (a terminal, a systemd unit, anything) goes away. `docker run --rm` only +# cleans up on the container's own clean exit, not on our death. +# +# Fix: run docker as a child, install signal traps that issue `docker stop` +# before exiting. Now `lucebox serve` behaves like a normal foreground +# program — close the terminal, kill the wrapper, send SIGTERM from +# systemd, the container goes down with it. +# +# Stops also from EXIT so even a `set -e` propagation cleans up. +_serve_and_track() { + "$@" & + local docker_pid=$! + # shellcheck disable=SC2317 # called via trap, not "unreachable" + _serve_stop() { + trap - HUP INT TERM EXIT + # Best-effort: container may already be exiting / never started. + # `docker stop` blocks up to -t seconds for graceful shutdown + # (server handles SIGTERM), then SIGKILLs. 10s is enough for the + # in-flight request to finish on a typical decode. + docker stop -t 10 "$CONTAINER_NAME" >/dev/null 2>&1 || true + wait "$docker_pid" 2>/dev/null || true + } + trap _serve_stop HUP INT TERM EXIT + wait "$docker_pid" + local rc=$? 
+ trap - HUP INT TERM EXIT + return $rc +} + +cmd_systemd_install() { + require_host_prereqs + ensure_probed + require_systemd "service install" + local docker_bin + docker_bin=$(command -v docker) + + mkdir -p "$(dirname "$UNIT_PATH")" + # Capture the user's resolved env at install time so the unit launches + # with the same image/variant/port/models the user expected when they + # ran `lucebox install`. Systemd's user-session env is sparse — without + # this block, the wrapper inside the unit would fall back to the + # in-script defaults and silently pick a different image or models + # directory than the user's interactive session uses. + # + # ExecStartPre cleans up any orphaned container with the target name + # left behind by a previous crash (docker's `--rm` only fires on clean + # exit — a SIGKILL or daemon restart leaves the name claimed, and the + # next ExecStart would die with "name already in use" while systemd + # reports a useless "exit code 125"). + cat > "$UNIT_PATH" </dev/null | awk -F= '/^Linger=/{print $2}') + if [ "$linger" != "yes" ]; then + warn "Linger is off for $USER — the service will stop when you log out" + hint "To enable (requires sudo): sudo loginctl enable-linger \"$USER\"" + fi + + printf '\nNext:\n' + hint " $SCRIPT_NAME start # start now" + hint " $SCRIPT_NAME enable # start at every login" + hint " $SCRIPT_NAME logs # follow the journal" +} + +cmd_systemd_uninstall() { + require_systemd "service uninstall" + if systemctl --user is-active --quiet "$UNIT_NAME" 2>/dev/null; then + info "Stopping $UNIT_NAME" + systemctl --user stop "$UNIT_NAME" || true + fi + if systemctl --user is-enabled --quiet "$UNIT_NAME" 2>/dev/null; then + info "Disabling $UNIT_NAME" + systemctl --user disable "$UNIT_NAME" || true + fi + if [ -f "$UNIT_PATH" ]; then + rm -f "$UNIT_PATH" + ok "Removed $UNIT_PATH" + else + info "No unit at $UNIT_PATH — nothing to remove" + fi + systemctl --user daemon-reload + hint "Config and models are left in place. 
Remove them by hand if you want." +} + +cmd_systemctl_passthrough() { + local action="$1" + require_systemd "$action" + if [ ! -f "$UNIT_PATH" ]; then + err "$UNIT_NAME is not installed — run '$SCRIPT_NAME install' first" + exit 1 + fi + case "$action" in + start|restart) + # `systemctl start` is fire-and-forget for Type=exec: it returns + # success as soon as execve() completes, even if the wrapper + # exits 1 a millisecond later. That gave us the worst possible + # UX — `lucebox start` reports no error but no container ever + # binds port 8080. Poll is-active for a few seconds and dump + # status + recent journal lines so the user sees the real cause. + local current + current=$(systemctl --user is-active "$UNIT_NAME" 2>/dev/null || true) + # `start` against an already-active unit: systemctl returns 0 + # silently. That's polite for scripts but confusing for humans + # — say so explicitly. For `restart` always run through. + if [ "$action" = "start" ] && [ "$current" = "active" ]; then + ok "$UNIT_NAME is already active" + hint "logs: $SCRIPT_NAME logs" + hint "smoke: curl -s http://localhost:$DEFAULT_PORT/v1/models" + hint "(use \`$SCRIPT_NAME restart\` to bounce, \`$SCRIPT_NAME stop\` to halt)" + return 0 + fi + # `start` against a unit stuck in restart-loop ("activating") is + # the symptom of a broken ExecStart — calling start would just + # block waiting for active that never comes. Surface this + # specifically so the user goes to `lucebox logs` to find the + # exit reason rather than waiting for the poll to give up. + if [ "$action" = "start" ] && [ "$current" = "activating" ]; then + err "$UNIT_NAME is in restart-loop (state=activating)" + hint "the unit is failing and being auto-restarted by systemd" + hint " $SCRIPT_NAME stop # halt the loop first" + hint " $SCRIPT_NAME logs # find the exit reason" + exit 1 + fi + info "$action $UNIT_NAME" + if ! 
systemctl --user "$action" "$UNIT_NAME"; then + err "systemctl --user $action $UNIT_NAME failed" + systemctl --user status "$UNIT_NAME" --no-pager -n 30 || true + exit 1 + fi + local i state + for i in 1 2 3 4 5 6 7 8 9 10; do + state=$(systemctl --user is-active "$UNIT_NAME" 2>/dev/null || true) + case "$state" in + active) break ;; # already up — no need to keep polling + activating) ;; # still booting; keep waiting + *) break ;; # failed / inactive — fall through to error path + esac + sleep 1 + done + state=$(systemctl --user is-active "$UNIT_NAME" 2>/dev/null || true) + if [ "$state" != "active" ]; then + err "$UNIT_NAME did not reach active state (current: ${state:-unknown})" + if [ "$state" = "activating" ]; then + hint "the unit is in a restart loop — \`$SCRIPT_NAME stop\` to halt it" + fi + hint "status:" + systemctl --user status "$UNIT_NAME" --no-pager -n 30 || true + hint "recent journal:" + journalctl --user -u "$UNIT_NAME" -n 30 --no-pager || true + exit 1 + fi + ok "$UNIT_NAME is active" + hint "logs: $SCRIPT_NAME logs" + hint "smoke: curl -s http://localhost:$DEFAULT_PORT/v1/models" + ;; + stop|enable|disable) + exec systemctl --user "$action" "$UNIT_NAME" ;; + status) + exec systemctl --user status "$UNIT_NAME" --no-pager ;; + *) + die "unknown systemctl passthrough: $action" ;; + esac +} + +cmd_logs() { + require_systemd "logs" + # Pure passthrough: any flags the user wants (-f, -n, --since, ...) go + # straight to journalctl. Default is follow. + if [ $# -eq 0 ]; then + exec journalctl --user -u "$UNIT_NAME" -f + fi + exec journalctl --user -u "$UNIT_NAME" "$@" +} + +cmd_pull() { + # Pull has to run on the host. Delegating this into the container creates a + # stale-image trap: docker may start an old local tag before the fresh tag + # has been pulled. 
+ require_host_prereqs + local variant + variant=$(pick_variant) + info "Pulling ${IMAGE_BASE}:${variant}" + exec docker pull "${IMAGE_BASE}:${variant}" +} + +cmd_update() { + # Re-run the bootstrap installer against the channel we were installed + # from. The installer is the source of truth for "how do you install + # lucebox correctly" — chmod, atomic mv, validation, baking the source + # URL back into the new copy so the channel is preserved across + # upgrades. Keeping the logic in install.sh means it can evolve + # independently (sha verify, signature check, etc.) and the installed + # `lucebox update` picks those changes up on the next run. + # + # The installer URL is derived from LUCEBOX_INSTALLED_FROM by swapping + # `lucebox.sh` → `install.sh` in the same directory, so forks don't + # need a separate registration. Override the source channel via + # $LUCEBOX_INSTALL_URL (e.g. to switch from canonical to a dev fork). + local source_url installer_url target + source_url="${LUCEBOX_INSTALL_URL:-$LUCEBOX_INSTALLED_FROM}" + if [[ "$source_url" != */lucebox.sh ]]; then + die "LUCEBOX_INSTALLED_FROM doesn't end in /lucebox.sh: $source_url" + fi + installer_url="${source_url%/lucebox.sh}/install.sh" + target=$(realpath "$SCRIPT_PATH") + + info "Updating lucebox via $installer_url" + info " source: $source_url" + info " target: $target" + + # Pass the URLs through to install.sh via env. The installer reads + # $LUCEBOX_INSTALL_URL (which we set to source_url) and + # $LUCEBOX_INSTALL_DEST (the realpath of *this* file, so a symlinked + # install replaces the actual file behind the link). + LUCEBOX_INSTALL_URL="$source_url" \ + LUCEBOX_INSTALL_DEST="$target" \ + bash -c "$(curl -fsSL "$installer_url")" \ + || die "update failed (installer exited non-zero)" +} + +cmd_completion() { + # Print shell completion script for bash / zsh / fish. 
Usage: + # + # # bash (in ~/.bashrc): + # source <(lucebox completion bash) + # + # # zsh (in ~/.zshrc, before `compinit`): + # source <(lucebox completion zsh) + # + # # fish: + # lucebox completion fish | source + # + # Keep this in sync with the dispatch table in main() and the sub-app + # verbs (config get/set/unset, models list/download). Adding a new + # top-level command means adding it here too. + local shell="${1:-}" + case "$shell" in + bash) + cat <<'BASH' +# lucebox bash completion. Source from ~/.bashrc: +# source <(lucebox completion bash) +_lucebox_complete() { + local cur prev cmds config_verbs models_verbs completion_shells + COMPREPLY=() + cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" + cmds="install uninstall start stop restart enable disable status logs \ + serve pull update check completion config models autotune \ + profile smoke print-run help version" + config_verbs="get set unset" + models_verbs="list download" + completion_shells="bash zsh fish" + + # Sub-app verbs / shell args. + case "$prev" in + config) COMPREPLY=( $(compgen -W "$config_verbs" -- "$cur") ); return ;; + models) COMPREPLY=( $(compgen -W "$models_verbs" -- "$cur") ); return ;; + completion) COMPREPLY=( $(compgen -W "$completion_shells" -- "$cur") ); return ;; + esac + + # Top-level command. + if [ "$COMP_CWORD" = 1 ]; then + COMPREPLY=( $(compgen -W "$cmds" -- "$cur") ) + return + fi +} +complete -F _lucebox_complete lucebox lucebox.sh +BASH + ;; + zsh) + # Bash-compat shim: zsh sources our bash completion through + # bashcompinit. Users who prefer native zsh _arguments-style + # completion can write their own; this gets `` working + # in two lines for free. + cat <<'ZSH' +# lucebox zsh completion. Source from ~/.zshrc (after compinit): +# source <(lucebox completion zsh) +autoload -Uz compinit bashcompinit +compinit +bashcompinit +ZSH + cmd_completion bash + ;; + fish) + cat <<'FISH' +# lucebox fish completion. 
Source from ~/.config/fish/config.fish: +# lucebox completion fish | source +complete -c lucebox -f +set -l __lucebox_cmds install uninstall start stop restart enable disable \ + status logs serve pull update check completion config models autotune \ + profile smoke print-run help version +for cmd in $__lucebox_cmds + complete -c lucebox -n "not __fish_seen_subcommand_from $__lucebox_cmds" -a $cmd +end +complete -c lucebox -n "__fish_seen_subcommand_from config" -a "get set unset" +complete -c lucebox -n "__fish_seen_subcommand_from models" -a "list download" +complete -c lucebox -n "__fish_seen_subcommand_from completion" -a "bash zsh fish" +FISH + ;; + ""|--help|-h) + cat </dev/null; then + _row 0 "docker daemon" "installed but unreachable — start the daemon or add user to 'docker' group" + else + _row 0 "docker daemon" "not installed — https://docs.docker.com/engine/install/" + fi + + # nvidia container toolkit + case "$LUCEBOX_HOST_HAS_CTK" in + runtime) _row 1 "nvidia ctk" "wired into docker (runtime)" ;; + cdi) _row 1 "nvidia ctk" "wired via CDI (nvidia.com/gpu)" ;; + installed-unwired) _row warn "nvidia ctk" "installed but not registered with docker — sudo nvidia-ctk runtime configure --runtime=docker && sudo systemctl restart docker" ;; + none|*) _row 0 "nvidia ctk" "not installed — https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html" ;; + esac + + # nvidia-smi + driver + if [ "$LUCEBOX_HOST_GPU_VENDOR" = "nvidia" ]; then + if [ "$LUCEBOX_HOST_DRIVER_MAJOR" -ge "$MIN_DRIVER_CUDA12" ]; then + _row 1 "nvidia driver" "$LUCEBOX_HOST_DRIVER_VERSION (≥ $MIN_DRIVER_CUDA12 required for cuda12)" + else + _row 0 "nvidia driver" "$LUCEBOX_HOST_DRIVER_VERSION (< $MIN_DRIVER_CUDA12 — cuda12 image will fail)" + fi + elif command -v nvidia-smi &>/dev/null; then + _row 0 "nvidia driver" "nvidia-smi present but NVML calls fail — driver/library mismatch, try reboot" + else + _row 0 "nvidia driver" "nvidia-smi not found — install the 
NVIDIA driver" + fi + + # GPU detail + if [ "$LUCEBOX_HOST_GPU_VENDOR" = "nvidia" ]; then + _row 1 "gpu" "$LUCEBOX_HOST_GPU_NAME × $LUCEBOX_HOST_GPU_COUNT (sm_$LUCEBOX_HOST_GPU_SM, ${LUCEBOX_HOST_VRAM_GB} GB VRAM)" + # cuda12 image arch coverage: sm_75;80;86;89;90;120 (see docker-bake.hcl) + case "$LUCEBOX_HOST_GPU_SM" in + 75|80|86|89|90|120) _row 1 "cuda12 arch" "sm_$LUCEBOX_HOST_GPU_SM covered by image" ;; + "") _row warn "cuda12 arch" "compute_cap not detected" ;; + *) _row warn "cuda12 arch" "sm_$LUCEBOX_HOST_GPU_SM not in image arch list (75;80;86;89;90;120)" ;; + esac + fi + + # systemd + if [ "$LUCEBOX_HOST_HAS_SYSTEMD" = "1" ]; then + _row 1 "user systemd" "available (needed for '$SCRIPT_NAME install')" + elif [ "$LUCEBOX_HOST_IS_WSL" = "1" ]; then + _row warn "user systemd" "WSL detected — set 'systemd=true' under [boot] in /etc/wsl.conf, then 'wsl --shutdown'" + else + _row warn "user systemd" "not available — '$SCRIPT_NAME install' (service unit) won't work; '$SCRIPT_NAME serve' (foreground) will" + fi + + # image we'd pull — marked ✗ when the host clearly can't run cuda12 + # (no nvidia driver, or no CTK wired into docker). It's still useful + # to print the line so the user knows what would be pulled, but a + # green ✓ would be misleading. + if [ "$LUCEBOX_HOST_GPU_VENDOR" != "nvidia" ]; then + _row 0 "image" "${IMAGE_BASE}:${variant} — requires NVIDIA driver" + elif [ "$LUCEBOX_HOST_HAS_CTK" = "none" ] || [ "$LUCEBOX_HOST_HAS_CTK" = "installed-unwired" ]; then + _row 0 "image" "${IMAGE_BASE}:${variant} — needs NVIDIA Container Toolkit wired into docker" + else + _row 1 "image" "${IMAGE_BASE}:${variant}" + fi + # RAM / cores (informational) + _row 1 "host" "${LUCEBOX_HOST_NPROC} cpus, ${LUCEBOX_HOST_RAM_GB} GB RAM" +} + +cmd_in_container() { + # Generic dispatcher: anything that isn't a systemd action goes here. + # Runs the in-container Python CLI with the supplied argv. 
# Report whether the long-running lucebox container is currently up. The
# dispatcher uses this to choose between `docker exec` into it (cheap, shares
# the running server's network namespace so localhost:8080 reaches the
# server) and `docker run` (cold start, isolated network).
#
# `docker ps -q -f name=^${CONTAINER_NAME}$` prints the container id when the
# container is running and nothing otherwise. The anchored regex keeps
# `lucebox-cli-12345`-style ephemeral siblings from matching.
#
# Returns 0 when running, 1 when not running or when docker is unavailable.
_lucebox_container_running() {
    # Without a docker binary on PATH there is nothing to probe.
    if ! command -v docker >/dev/null 2>&1; then
        return 1
    fi
    local running_id
    running_id=$(docker ps -q -f "name=^${CONTAINER_NAME}\$" 2>/dev/null || true)
    if [ -z "$running_id" ]; then
        return 1
    fi
    return 0
}
cmd_exec_in_container() {
    # Run the in-container Python CLI via `docker exec` against the live
    # "$CONTAINER_NAME" container, forwarding "$@" as the CLI argv. This
    # function replaces the current shell process (exec) and never returns.
    require_host_prereqs
    ensure_probed

    # Keep stdin open always; allocate a TTY only when both stdin and stdout
    # are terminals, so piped / redirected invocations keep working.
    local tty_flag=-i
    if [ -t 0 ] && [ -t 1 ]; then
        tty_flag=-it
    fi

    local argv=(
        docker exec "$tty_flag"
        --user "$(id -u):$(id -g)"
        -w "$PWD"
        -e "HOME=$HOME"
    )

    # Forward every exported LUCEBOX_HOST_* fact by name via indirect
    # expansion; `|| true` keeps an empty match from failing the pipeline.
    local name
    for name in $(compgen -e | grep '^LUCEBOX_HOST_' || true); do
        argv+=(-e "$name=${!name}")
    done

    argv+=(
        -e "LUCEBOX_IMAGE=$IMAGE_BASE"
        -e "LUCEBOX_VARIANT=$(pick_variant)"
        -e "LUCEBOX_PORT=$DEFAULT_PORT"
        -e "LUCEBOX_CONTAINER=$CONTAINER_NAME"
        -e "LUCEBOX_MODELS=$DEFAULT_MODELS_DIR"
    )
    if [ -n "${HF_TOKEN:-}" ]; then
        argv+=(-e "HF_TOKEN=$HF_TOKEN")
    fi

    # `docker exec` bypasses the image's ENTRYPOINT and the image has no
    # top-level `lucebox` binary on PATH, so call the entrypoint shim
    # explicitly with `lucebox` as its subcommand followed by the user's
    # argv tail. This mirrors what `docker run` does on the lucebox branch.
    argv+=("$CONTAINER_NAME" /opt/lucebox-hub/server/scripts/entrypoint.sh lucebox "$@")
    exec "${argv[@]}"
}
_lucebox_prefer_exec() {
    # Decide whether subcommand "$1" (with its remaining argv) may run via
    # `docker exec` in the live container.
    # Returns 0 to prefer exec, 1 to require the docker-run / host-side path.
    local cmd="$1"
    shift

    if [ "$cmd" = "autotune" ]; then
        # Plain `autotune` is exec-safe, but `--sweep` restarts the service
        # per cell and would kill the container the exec is running in.
        local arg
        for arg in "$@"; do
            if [ "$arg" = "--sweep" ]; then
                return 1
            fi
        done
        return 0
    fi

    # Steady-state / read-only subcommands are exec-safe.
    case "$cmd" in
        config|smoke|models|check|profile|print-run|print-serve-argv)
            return 0
            ;;
    esac

    # Everything else (service mutators, the long-running service itself).
    return 1
}
cmd_route_to_container() {
    # Route subcommand "$1" (plus its argv) to the in-container Python CLI.
    # The exec path is taken only when all three hold:
    #   * LUCEBOX_NO_EXEC is not "1",
    #   * the subcommand is in the safe-to-exec set, and
    #   * the long-running container is up.
    # Every other combination falls back to the docker-run path.
    local cmd="$1"
    shift
    if [ "${LUCEBOX_NO_EXEC:-0}" != "1" ] \
        && _lucebox_prefer_exec "$cmd" "$@" \
        && _lucebox_container_running; then
        cmd_exec_in_container "$cmd" "$@"
        return
    fi
    cmd_in_container "$cmd" "$@"
}
main() {
    # Global flag pass: `--no-exec` is stripped wherever it appears on the
    # command line (before or after the subcommand) and forces the
    # docker-run path even if the container is up. Equivalent to
    # `LUCEBOX_NO_EXEC=1 lucebox ...`. Popping it out of argv up-front means
    # the rest of dispatch doesn't have to know about it.
    local args=()
    while [ $# -gt 0 ]; do
        case "$1" in
            --no-exec) export LUCEBOX_NO_EXEC=1 ;;
            *)         args+=("$1") ;;
        esac
        shift
    done
    # `${args[@]+...}` expands to nothing when the array is empty. A bare
    # "${args[@]}" on an empty array is an "unbound variable" error on
    # bash < 4.4 when `set -u` is active, which would break a bare `lucebox`
    # (or `lucebox --no-exec`) invocation instead of printing the help text.
    set -- ${args[@]+"${args[@]}"}

    # No subcommand at all behaves like `help`.
    local cmd="${1:-help}"
    [ $# -gt 0 ] && shift
    case "$cmd" in
        # Systemd surface
        install)   cmd_systemd_install "$@" ;;
        uninstall) cmd_systemd_uninstall "$@" ;;
        start|stop|restart|enable|disable|status)
                   cmd_systemctl_passthrough "$cmd" "$@" ;;
        logs)      cmd_logs "$@" ;;

        # Direct server
        serve)     cmd_serve "$@" ;;
        pull)      cmd_pull "$@" ;;

        # Self-update: re-runs the bootstrap installer against the channel
        # this script was installed from (LUCEBOX_INSTALLED_FROM).
        update)    cmd_update "$@" ;;

        # Host-only readiness check: pure shell, never enters the container.
        check)     cmd_check "$@" ;;

        # Shell completion: print a script the user sources into their rc
        # file. Bash and zsh share the bash-style emitter (zsh goes through
        # bashcompinit); fish is native.
        completion) cmd_completion "$@" ;;

        # Help / version
        help|--help|-h)    usage ;;
        version|--version) printf '%s\n' "$VERSION" ;;

        # Everything else -> in-container Python CLI. cmd_route_to_container
        # picks between `docker exec` into the live container (cheap, shares
        # the running server's network namespace) and `docker run` (cold,
        # isolated) based on container state + the safe-to-exec command set.
        *)         cmd_route_to_container "$cmd" "$@" ;;
    esac
}
typer pulls click+rich; tomli-w gives us TOML +# writes (stdlib tomllib only reads). httpx for the smoke + readiness probes. +# huggingface_hub for download-models — used directly (not via subprocess) +# so we can drive a Rich progress bar + verify sha256 against the repo +# metadata before re-fetching multi-GB GGUFs. +dependencies = [ + "typer>=0.12", + "rich>=13", + "httpx>=0.27", + "tomli-w>=1.0", + "huggingface_hub>=0.27", + # luce-bench is consumed lazily by the autotune sweep scorer + # (agent_replay_pass_rate in sweep.py does a function-local + # `from lucebench.areas.agent_recorded import ...` wrapped in try/except). + # It's deliberately NOT a hard dep here because the workspace can't lock + # against it until #337 (luce-bench in-tree) lands. Install with + # `uv pip install luce-bench` on the host running the scorer. +] + +[project.scripts] +lucebox = "lucebox.cli:app" + +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[tool.hatch.version] +source = "vcs" +# Untagged checkouts (e.g. fresh clone before tagging lucebox-v0.2.1) +# resolve to this rather than 0.0.0.dev0. +fallback-version = "0.2.1.dev0" +raw-options.tag_regex = '''^lucebox-v(?P<version>\d+\.\d+\.\d+)$''' + +[tool.hatch.build.hooks.vcs] +# Build hook writes the resolved version into src/lucebox/_version.py +# so `__init__.py` can `from lucebox._version import __version__`. +# Generated file — see lucebox/.gitignore. +version-file = "src/lucebox/_version.py" + +[tool.hatch.build.targets.wheel] +packages = ["src/lucebox"] diff --git a/lucebox/src/lucebox/__init__.py b/lucebox/src/lucebox/__init__.py new file mode 100644 index 00000000..00a7611f --- /dev/null +++ b/lucebox/src/lucebox/__init__.py @@ -0,0 +1,15 @@ +"""lucebox — host-side CLI for the lucebox-hub container. + +Runs inside the container; the host wrapper at ../lucebox.sh handles `docker +run` plumbing and systemd integration.
This package owns: TOML config, +autotune rules, docker daemon calls (via the mounted socket), the benchmark +sweep orchestrator, the smoke test, and model download. +""" + +# Version is generated by hatch-vcs at build time into _version.py. +# Fresh source-tree checkouts before any build will not yet have the +# file — fall back to a dev marker so imports don't break. +try: + from lucebox._version import __version__ +except ImportError: + __version__ = "0.0.0.dev0+unbuilt" diff --git a/lucebox/src/lucebox/__main__.py b/lucebox/src/lucebox/__main__.py new file mode 100644 index 00000000..128e2ca8 --- /dev/null +++ b/lucebox/src/lucebox/__main__.py @@ -0,0 +1,6 @@ +"""Entry point for `python -m lucebox`.""" + +from lucebox.cli import app + +if __name__ == "__main__": + app() diff --git a/lucebox/src/lucebox/autotune.py b/lucebox/src/lucebox/autotune.py new file mode 100644 index 00000000..ad027bb2 --- /dev/null +++ b/lucebox/src/lucebox/autotune.py @@ -0,0 +1,489 @@ +"""Heuristic autotune: VRAM tier → DflashRuntime defaults + preset picker. + +The recommended runtime is computed from HostFacts (VRAM, is_wsl) and the +recommended preset from VRAM tier alone. Both are stateless — they take +HostFacts in and return a fresh value — so the CLI can apply them with +``lucebox autotune --apply`` without holding any global state. + +Profiles +-------- + +``lucebox autotune --sweep --profile `` selects a workload-specific +bracket + winner-pick strategy. The default profile is ``heuristic`` — +preset-agnostic, ranks cells by mean ``decode_tokens_per_sec``. 
def _preset_approx_gb(preset: str) -> float:
    """Estimate the combined model + draft VRAM footprint of ``preset`` in GB.

    The figure is only used to choose a ``max_ctx`` tier when the preset is
    known. The lookup is best-effort: ``0.0`` is returned when the preset is
    not in ``lucebox.download.PRESETS`` or when the lookup fails for any
    reason, and callers should then fall back to the tier defaults.
    """
    try:
        # Imported lazily, inside the guard, so a failing import degrades
        # to the 0.0 fallback as well.
        from lucebox.download import PRESETS

        entry = PRESETS.get(preset)
        if entry is None:
            return 0.0
        return float(entry.approx_total_gb)
    except Exception:
        return 0.0
+ 22-31 — 24 GB-class consumer flagships (3090/4090/5090/5090-Laptop). + Default cap at 96 K (tq3_0, ~2 GB KV + ~18 GB model ≈ 20 GB). + For ≥20 GB models (gemma-4-31b at 21 GB, qwen3.6-moe at 22 GB) + the KV headroom shrinks to ~2-3 GB → cap at 32 K instead. + Confirmed on bragi (RTX 5090 Laptop, 23 GB VRAM) 2026-05-31: + Gemma4-31B starts at 32K but would OOM at 98K with tq3_0. + 32-47 — RTX 6000 Ada / A100 40 GB. Full 128 K. + ≥48 — A100 80 GB / H100 / RTX 6000 Pro. Full 128 K. + + Prefix cache remains an explicit sweep tunable, but the automatic + baseline keeps it off because tool prompts currently exercise a daemon + snapshot path that is not reliable with prefix slots enabled. + Empirically confirmed on bragi 2026-05-31: prefix_cache_slots=32 + caused -19pp regression on agent_recorded (23.1% vs 42.3% baseline). + 5 previously-passing cases regressed; 0 new cases unlocked. See + docs/experiments/qwen3.6-27b-prefix-cache-regression-bragi-2026-05-31.md. + + On `lazy`: the C++ server requires `--prefill-drafter` (and `--draft`) + to be set for `--lazy-draft` to take effect, and silently ignores it + otherwise (`--lazy-draft ignored: requires both --prefill-drafter and + --draft`). Since the heuristic path does NOT set `prefill_drafter`, + we default `lazy=False` here — "what we say" matches "what runs". + Users who explicitly opt in via config.toml will be warned at server + startup that the flag is being dropped (see entrypoint.sh). + """ + if host.vram_gb <= 0: + return DflashRuntime() # no VRAM signal — stick with class defaults + + if host.vram_gb < 12: + return DflashRuntime(max_ctx=4096) + if host.vram_gb < 22: + return DflashRuntime(max_ctx=32768) + if host.vram_gb < 32: + # On 22-31 GB cards, model size matters. Models ≥20 GB (e.g. gemma-4-31b + # at ~21 GB, qwen3.6-moe at ~22 GB) leave only ~2-3 GB for KV, limiting + # practical max_ctx to ~32K even with tq3_0. Models <20 GB (e.g. 
def candidate_configs(host: HostFacts, preset: str = "") -> list[DflashRuntime]:
    """Build the empirical sweep bracket for this host.

    The seed is ``runtime_from_host(host, preset=preset)``. Passing
    ``preset`` through matters on the 22-31 GB tier, where large presets get
    a lower ``max_ctx`` cap than small ones. Every other candidate is a
    ``replace()`` of the seed, so fields outside the swept axes stay aligned
    with the heuristic. The seed always comes first in the result.

    Swept axes per VRAM tier:
        <12 GB   seed only; treat the one-cell "sweep" as a smoke test
        12-21    budget in {8, 16, 22}; ctx and KV stay at the seed values
        22-31    budget in {16, 22, 32} x KV in {tq3_0, q8_0}
        32-47    budget in {22, 32} x KV in {tq3_0, q8_0, f16}
        >=48     budget in {32, 48} x KV in {tq3_0, q8_0, f16}

    Candidates are de-duplicated on ``(budget, max_ctx, cache_type_k,
    cache_type_v)`` with the first occurrence winning, so a swept value that
    lands on the seed does not produce a redundant cell.
    """
    seed = runtime_from_host(host, preset=preset)
    vram = host.vram_gb

    # Below 12 GB the model barely fits; sweeping risks OOM more than it
    # improves throughput.
    if vram < 12:
        return [seed]

    if vram < 22:
        # Tight fit: vary the budget only.
        variants = [replace(seed, budget=budget) for budget in (8, 16, 22)]
    else:
        if vram < 32:
            # 24 GB consumer flagships: KV quantization is the
            # highest-leverage knob; f16 KV is left out of the matrix.
            budgets, kv_types = (16, 22, 32), ("tq3_0", "q8_0")
        elif vram < 48:
            # f16 KV is in budget here; the small budget=16 is dropped.
            budgets, kv_types = (22, 32), ("tq3_0", "q8_0", "f16")
        else:
            # Higher-end cards: drop budget=22 and pick up budget=48.
            budgets, kv_types = (32, 48), ("tq3_0", "q8_0", "f16")
        variants = [
            replace(seed, budget=budget, cache_type_k=kv, cache_type_v=kv)
            for budget in budgets
            for kv in kv_types
        ]

    # dict preserves insertion order and setdefault keeps the first entry
    # per key, so the seed survives any collision with a swept cell.
    unique: dict[tuple[int, int, str, str], DflashRuntime] = {}
    for runtime in (seed, *variants):
        key = (
            runtime.budget,
            runtime.max_ctx,
            runtime.cache_type_k,
            runtime.cache_type_v,
        )
        unique.setdefault(key, runtime)
    return list(unique.values())
def _coding_agent_loop_gemma_bracket(
    host: HostFacts, base: DflashRuntime
) -> list[DflashRuntime]:
    """Gemma4 bracket: ``max_ctx x fa_window x budget`` with pflash pinned off.

    The KV-quant axis is intentionally absent: per the notes in this module,
    the gemma4 backend forces F16 KV regardless of ``cache_type_k/v``, so
    every cell clears both fields. ``prefill_mode`` is fixed to ``"off"``
    for every cell; pflash needs both a drafter file and
    ``prefix_cache_slots > 0`` to compress anything, so it is left for the
    operator to enable manually.

    Per tier:
        <22 GB   ``[base]`` only (model barely fits; sweeping risks OOM)
        >=22 GB  a seed derived from ``base`` plus 12 swept cells:
                 max_ctx in {98304, 131072} x fa_window in {0, 2048}
                 x budget in {16, 22, 32}

    Cells are de-duplicated on ``(max_ctx, fa_window, budget, prefill_mode)``
    with the first occurrence winning, so the result holds 12 or 13 entries
    depending on whether the seed coincides with a swept cell.
    """
    if host.vram_gb < 22:
        return [base]

    # Fields every gemma cell shares: KV cleared to match the backend's
    # hardcoded F16, pflash off.
    gemma_fixed = {"cache_type_k": "", "cache_type_v": "", "prefill_mode": "off"}

    # Seed with the heuristic base so its (possibly safer) max_ctx stays in
    # the bracket. Without it, a host that cannot serve 98K/131K would see
    # every cell fail and the sweep would produce no winner.
    cells = [replace(base, **gemma_fixed)]
    cells.extend(
        replace(
            base,
            max_ctx=max_ctx,
            fa_window=fa_window,
            budget=budget,
            **gemma_fixed,
        )
        for max_ctx in (98_304, 131_072)
        for fa_window in (0, 2048)
        for budget in (16, 22, 32)
    )

    # Insertion-ordered de-duplication; the seed wins any collision.
    unique: dict[tuple[int, int, int, str], DflashRuntime] = {}
    for rt in cells:
        unique.setdefault((rt.max_ctx, rt.fa_window, rt.budget, rt.prefill_mode), rt)
    return list(unique.values())
def _coding_agent_loop_candidates(host: HostFacts, preset: str) -> list[DflashRuntime]:
    """Pick the per-architecture coding-agent-loop bracket builder and run it.

    The heuristic runtime from :func:`runtime_from_host` is the seed for
    both families; the selected builder decides which axes to vary
    around it.
    """
    seed = runtime_from_host(host, preset=preset)
    # Anything that is not gemma takes the qwen-shape bracket — the only
    # other supported presets today are qwen3.6 and laguna, and that
    # bracket gracefully degrades to base-only on tiny VRAM tiers.
    build_bracket = (
        _coding_agent_loop_gemma_bracket
        if _is_gemma_preset(preset)
        else _coding_agent_loop_qwen_bracket
    )
    return build_bracket(host, seed)
def recommend_preset(host: HostFacts) -> str | None:
    """Choose the first-run default preset from the host's VRAM tier.

    Returns ``None`` when no tier matches, which tells the caller to ask
    the user instead. Tiers follow the model size catalog: 22 GB+ →
    Qwen3.6-27B (the Lucebox default), 16-21 GB → Laguna-XS.2 (small
    target-only). Below 16 GB we punt — the registered presets all need
    at least 16 GB to run usefully.
    """
    vram_gb = host.vram_gb
    # Ordered from the largest floor down; the first floor met wins.
    tiers = ((22, "qwen3.6-27b"), (16, "laguna-xs.2"))
    for floor_gb, preset_name in tiers:
        if vram_gb >= floor_gb:
            return preset_name
    return None
Anything `lucebox` +doesn't intercept (everything outside the systemd surface) ends up here. + +Subcommand inventory: + check — readiness report + config get/set/unset — read / write a single key in config.toml + pull — docker pull the cuda12 image + print-run — emit the docker-run command for the server + print-serve-argv — same, raw argv lines (consumed by `lucebox serve`) + autotune — print/persist VRAM-tier DFLASH_* defaults; `--sweep` + empirically tests a per-tier bracket and persists the winner + profile — run a luce-bench snapshot via the running container + smoke — hit /props + /v1/chat/completions on a running server + models — list / download presets, activate one + claude — launch Claude Code pointed at the running server + codex — launch Codex pointed at the running server + opencode — launch OpenCode pointed at the running server + hermes — launch Hermes pointed at the running server + pi — launch Pi pointed at the running server + openclaw — launch OpenClaw pointed at the running server +""" + +from __future__ import annotations + +import json +import os +import sys +from dataclasses import asdict, replace +from pathlib import Path +from typing import Annotated, Any + +import typer +from rich.console import Console +from rich.table import Table + +import lucebox.autotune as autotune_mod +import lucebox.config as config_mod +import lucebox.docker_run as docker_run +import lucebox.download as download_mod +import lucebox.host_check as host_check +import lucebox.profile as profile_mod +import lucebox.smoke as smoke_mod +from lucebox import __version__ +from lucebox.config import config_get, config_set, config_unset, live_config +from lucebox.host_facts import from_env + +app = typer.Typer( + name="lucebox", + help="Host CLI for the lucebox-hub container. Invoked by lucebox.sh.", + no_args_is_help=True, + add_completion=False, +) +console = Console() + + +# The strict 11-field allowlist that mirrors lucebench's snapshot +# config.json. 
def _load_or_build() -> config_mod.Config:  # type: ignore[name-defined]
    """Resolve the effective config: env > config.toml > dataclass defaults.

    ``config_mod.load()`` alone returns the persisted config verbatim,
    which silently drops ``LUCEBOX_IMAGE`` / ``LUCEBOX_VARIANT`` /
    ``LUCEBOX_PORT`` / ``LUCEBOX_CONTAINER`` / ``LUCEBOX_MODELS`` coming
    from the systemd unit's ``Environment=`` (or a one-shot shell
    export). That contradicted the precedence lucebox.sh documents and
    bit sindri: its config.toml had ``[image]`` without ``registry``, so
    the dataclass default won over the unit's ``LUCEBOX_IMAGE``. This
    function therefore overlays the environment on the loaded config,
    or on ``live_config()`` when config.toml is absent. Only the five
    top-level scalars have env hooks — dflash/host/model don't, by
    design.

    An environment variable that is exported but empty is treated as
    unset, so it cannot blank out a persisted value or crash the port
    parse.

    Returns:
        A copy of the loaded (or live) config with the env overrides and
        the freshest host facts applied.

    Raises:
        ValueError: ``LUCEBOX_PORT`` is set to something that is not an
            integer.
    """
    cfg = config_mod.load()
    if cfg is None:
        cfg = live_config()

    # Overlay live host facts. When config.toml exists without a [host]
    # block (the common case — operators don't hand-edit host facts),
    # cfg.host defaults to a zero-filled HostFacts and autotune/profile
    # decisions silently fall through to the "no VRAM signal" path.
    # Re-probe from env so the wrapper-exported LUCEBOX_HOST_* facts win
    # over the persisted (possibly absent) snapshot.
    live_host = from_env()
    host = live_host if live_host.vram_gb > 0 or live_host.nproc > 0 else cfg.host

    def env_or(name: str, fallback: str) -> str:
        # `or` makes an empty string fall through to the fallback.
        return os.environ.get(name) or fallback

    port_text = env_or("LUCEBOX_PORT", str(cfg.port))
    try:
        port = int(port_text)
    except ValueError as exc:
        # Same exception type as the bare int() call, but naming the
        # variable so the operator knows what to fix.
        raise ValueError(
            f"LUCEBOX_PORT must be an integer, got {port_text!r}"
        ) from exc

    return replace(
        cfg,
        variant=env_or("LUCEBOX_VARIANT", cfg.variant),
        image=env_or("LUCEBOX_IMAGE", cfg.image),
        container_name=env_or("LUCEBOX_CONTAINER", cfg.container_name),
        port=port,
        models_dir=Path(env_or("LUCEBOX_MODELS", str(cfg.models_dir))),
        host=host,
    )
+ """ + cfg = _load_or_build() + spec = docker_run.server_run_spec(cfg) + for tok in spec.argv(): + print(tok) + + +# ── autotune ─────────────────────────────────────────────────────────────── + + +@app.command() +def autotune( + apply_: Annotated[ + bool, + typer.Option("--apply", help="Write the 11 dflash.* keys to config.toml."), + ] = False, + json_out: Annotated[ + bool, + typer.Option("--json", help="Machine-readable output (the asdict of DflashRuntime)."), + ] = False, + force: Annotated[ + bool, + typer.Option( + "--force", + help=( + "With --apply: overwrite even when a persisted dflash.* key " + "already differs from the recommendation (e.g. a sweep-tuned " + "value)." + ), + ), + ] = False, + sweep: Annotated[ + bool, + typer.Option( + "--sweep", + help=( + "Empirically test a per-VRAM-tier bracket of dflash.* configs " + "against the live server and persist the winner. Uses " + "`lucebox config set` + `lucebox restart` + `luce-bench " + "snapshot` for each cell." + ), + ), + ] = False, + yes: Annotated[ + bool, + typer.Option( + "--yes", + "-y", + help="With --sweep: skip the confirmation prompt before starting.", + ), + ] = False, + profile: Annotated[ + str, + typer.Option( + "--profile", + help=( + "With --sweep: which workload profile to use. " + "'heuristic' (default) brackets KV-quant axes and scores by " + "mean decode_tps from a luce-bench snapshot. " + "'coding-agent-loop' brackets max_ctx × fa_window × budget × " + "pflash and scores by pass-rate on a real recorded agentic " + "session replay, then speed. See `lucebox autotune " + "--list-profiles` for the full set." + ), + ), + ] = "heuristic", + list_profiles: Annotated[ + bool, + typer.Option( + "--list-profiles", + help="Print registered autotune profiles + descriptions and exit.", + ), + ] = False, +) -> None: + """Compute the recommended DflashRuntime for this host. + + By default prints a Rich table comparing live defaults vs the + heuristic recommendation. 
Pass ``--apply`` to persist every value + in the 11-field allowlist to config.toml (sparse — only those keys + land on disk). ``--json`` dumps the recommendation as JSON for + scripting. + + Guard: when ``--apply`` would overwrite a value the user has + already persisted (typically from a sweep) with a different + recommendation, the command refuses and lists the affected keys. + Pass ``--force`` to overwrite anyway. + + ``--sweep`` is the empirical mode: builds a per-tier bracket of + candidate dflash.* configs (see ``autotune.candidate_configs``), + cycles the server through each one via ``lucebox restart`` + + readiness probe, runs ``lucebox profile --level level1`` to capture + decode_tps, picks the highest-tps cell as winner, and persists it. + Pre-sweep config.toml is backed up to ``.sweep-backup`` and restored + on interrupt or failure. ``--sweep`` is mutually exclusive with + ``--apply`` (sweep applies its own winner) and ``--json`` (sweep + is interactive). Pass ``--yes`` / ``-y`` to skip the confirmation + prompt. 
+ """ + if list_profiles: + table = Table(title="Autotune profiles") + table.add_column("name") + table.add_column("scorer") + table.add_column("description") + for name in sorted(autotune_mod.PROFILES): + p = autotune_mod.PROFILES[name] + table.add_row(p.name, p.scorer, p.description) + console.print(table) + return + + if sweep and (apply_ or json_out): + console.print( + "[red]--sweep is mutually exclusive with --apply and --json[/red]" + ) + raise typer.Exit(code=2) + if sweep: + from lucebox.sweep import run_sweep + + rc = run_sweep(console=console, yes=yes, profile=profile) + if rc != 0: + raise typer.Exit(code=rc) + return + + host = from_env() + cfg = _load_or_build() + runtime = autotune_mod.runtime_from_host(host, preset=cfg.model.preset) + if json_out: + print(json.dumps(asdict(runtime), indent=2)) + return + + table = Table(title="Recommended DflashRuntime") + table.add_column("key") + table.add_column("recommendation") + for name in DFLASH_ALLOWLIST: + table.add_row(name, str(getattr(runtime, name))) + console.print(table) + + if apply_: + # Drift guard. config_get with no key returns every reachable + # dflash.* entry tagged "file" (persisted) or "default" (in- + # memory only). Compare the persisted value to the + # recommendation; refuse on any drift unless --force. 
@config_app.command("get")
def config_get_cmd(
    key: Annotated[str, typer.Argument(help="Dotted key (omit to list every key).")] = "",
) -> None:
    """Print a single key (or every reachable key) with its origin annotation."""
    # An omitted argument arrives as "" and means "list everything",
    # which config_get spells as None.
    wanted = key if key else None
    try:
        resolved = config_get(wanted)
    except KeyError as exc:
        console.print(f"[red]{exc}[/red]")
        raise typer.Exit(code=2) from exc
    for dotted, pair in resolved.items():
        value, origin = pair
        console.print(f"{dotted} = {value!r} ([dim]from {origin}[/dim])")
+ + Only the named key is written — other on-disk keys are preserved + untouched, unset keys stay implicit. Use `lucebox config unset` to + remove a key (next read falls back to the live default). + """ + if "=" not in kv: + console.print("[red]argument must be key=value[/red]") + raise typer.Exit(code=2) + key, _, value = kv.partition("=") + key = key.strip() + value = value.strip() + try: + config_set(key, value) + except (KeyError, ValueError) as exc: + console.print(f"[red]{exc}[/red]") + raise typer.Exit(code=2) from exc + console.print(f"[green]Set[/green] {key} = {value}") + + +@config_app.command("unset") +def config_unset_cmd( + key: Annotated[str, typer.Argument(help="Dotted key to remove from config.toml.")], +) -> None: + """Remove a key from config.toml. Next read uses the live default.""" + try: + changed = config_unset(key) + except KeyError as exc: + console.print(f"[red]{exc}[/red]") + raise typer.Exit(code=2) from exc + if changed: + console.print(f"[green]Unset[/green] {key}") + else: + console.print(f"[dim]{key} was not in config.toml; nothing to do[/dim]") + + +# ── models sub-app ───────────────────────────────────────────────────────── + + +models_app = typer.Typer( + no_args_is_help=False, help="Manage local model presets (list, download, activate)." 
def _print_installed_presets() -> None:
    """Print the models dir, a table of installed presets, and total disk use.

    The active preset (``model.preset``) is marked with a leading ``*``.
    When nothing is installed, prints a hint pointing at
    ``lucebox models download`` instead of an empty table.
    """
    cfg = _load_or_build()
    installed = download_mod.installed_presets(cfg)
    active = cfg.model.preset
    console.print(f"Models dir: [bold]{cfg.models_dir}[/bold]")
    if not installed:
        console.print("[dim]No presets installed yet — try `lucebox models download`.[/dim]")
        return
    table = Table()
    table.add_column("preset")
    table.add_column("status")
    table.add_column("size (GB)")
    # Measure each preset once and reuse the figure for both its row and
    # the total, instead of calling installed_size_gb a second time per
    # preset when summing.
    total = 0.0
    for pres in installed:
        marker = "* " if pres.name == active else "  "
        size_gb = download_mod.installed_size_gb(cfg, pres)
        total += size_gb
        table.add_row(f"{marker}{pres.name}", "installed", f"{size_gb:.1f}")
    console.print(table)
    console.print(f"[dim]Total disk usage: {total:.1f} GB[/dim]")
typer.Option("--activate", help="Also set as active preset (model.preset).") + ] = False, +) -> None: + """Fetch a preset's GGUFs into the models dir. + + With no argument and no preset configured, recommends one for this + host's VRAM tier and auto-activates it (the first-install path). + Otherwise the named preset is downloaded; pass ``--activate`` to + also flip `model.preset` to it. + """ + cfg = _load_or_build() + if not preset: + if cfg.model.preset: + console.print( + "[yellow]No preset specified and one is already active. " + "Pass an explicit preset name (or use --activate to switch).[/yellow]" + ) + raise typer.Exit(code=2) + recommended = autotune_mod.recommend_preset(cfg.host) + if recommended is None: + console.print( + "[red]Cannot recommend a preset for this host. " + "Run `lucebox models list` and pick one explicitly.[/red]" + ) + raise typer.Exit(code=2) + preset = recommended + activate = True + console.print( + f"[bold]Recommended preset: {preset}[/bold] " + "(no preset configured; auto-activating after download)" + ) + + try: + pres = download_mod.resolve_preset(preset) + except KeyError as exc: + console.print(f"[red]{exc}[/red]") + raise typer.Exit(code=2) from exc + + current = download_mod.status(cfg, pres) + console.print(f"Models dir: [bold]{cfg.models_dir}[/bold]") + console.print(f"Preset: [bold]{pres.name}[/bold]") + console.print( + f" target ({pres.target_repo}/{pres.target_file}):" + f" {'present' if current['target_present'] else 'will download'}" + ) + if pres.has_draft: + console.print( + f" draft ({pres.draft_repo}/{pres.draft_file}):" + f" {'present' if current['draft_present'] else 'will download'}" + ) + else: + console.print(" draft [dim](none — target-only preset)[/dim]") + + if current["target_present"] and current["draft_present"]: + console.print("[green]Already present.[/green]") + else: + console.print(f"[bold]Downloading[/bold] (~{pres.approx_total_gb} GB total)…") + rc = download_mod.download_preset(cfg, pres) + if rc 
!= 0: + raise typer.Exit(code=rc) + console.print("[green]Done.[/green]") + + if activate: + config_set("model.preset", preset) + if pres.target_file: + config_set("model.target_file", pres.target_file) + if pres.has_draft and pres.draft_file: + config_set("model.draft_file", pres.draft_file) + else: + # Drop any stale draft_file from a previous activation; the + # active preset has no draft. + config_unset("model.draft_file") + console.print(f"[green]Activated:[/green] model.preset = {preset}") + + +# ── profile (collapsed wrapper) ──────────────────────────────────────────── + + +@app.command() +def profile( + level: Annotated[ + str, + typer.Option("--level", help="Snapshot tier: level0 / level1 / level2 / level3."), + ] = "level1", + url: Annotated[ + str, + typer.Option("--url", help="Server base URL; auto-detects when empty."), + ] = "", +) -> None: + """Run a luce-bench snapshot via the running container. + + Thin wrapper that probes the host, picks an output dir under + $XDG_DATA_HOME/lucebox/profile-snapshots, and exec's + ``luce-bench snapshot`` inside the running lucebox container. Errors + clearly when no container is up (hint: ``lucebox start`` first). 
+ """ + cfg = _load_or_build() + rc = profile_mod.run_profile(cfg, level=level, url=url or None, console=console) + if rc != 0: + raise typer.Exit(code=rc) + + +# ── smoke ────────────────────────────────────────────────────────────────── + + +@app.command() +def smoke( + timeout: Annotated[float, typer.Option(help="Per-request timeout (seconds).")] = 60.0, + tools: Annotated[ + bool, + typer.Option("--tools/--no-tools", help="Also require a tool-call response."), + ] = True, +) -> None: + """Hit /props + /v1/chat/completions on the running server; report PASS/FAIL.""" + cfg = _load_or_build() + result = smoke_mod.run(cfg, timeout_s=timeout, check_tools=tools) + console.print( + f"props={result.props_ok} tools={result.tool_ok} " + f"http={result.http_status} tokens={result.n_tokens} " + f"wall={result.wall_s:.2f}s" + ) + if result.ok: + console.print("[green]PASS[/green]") + return + console.print(f"[red]FAIL[/red] {result.error}") + raise typer.Exit(code=1) + + +# ── client launchers ─────────────────────────────────────────────────────── + + +def _detect_server_url(cfg_url: str | None) -> str: + """Auto-detect a live Lucebox server URL. + + Tries an explicit override first, otherwise probes the standard + localhost/docker-host base URLs from profile_mod and takes the first + that answers /health within 1s. Falls back to the first probe candidate + if nothing answers — lets the client fail with a clearer "server down" + error than the auto-detect can give. + """ + if cfg_url: + return cfg_url + cfg = _load_or_build() + bases = profile_mod._server_base_urls(cfg) + for candidate in bases: + if profile_mod._json_get(candidate + "/health", timeout_s=1.0): + return candidate + console.print( + f"[yellow]warning:[/yellow] no /health response at {bases[0]} " + f"— starting client anyway (server may be down)." 
def _exec_client(launcher_mod, *, url: str | None, model: str, prompt: str | None) -> None:
    """Resolve the server URL, then hand off to a harness client launcher.

    ``launcher_mod`` is a module exposing ``launch(base_url=, model=,
    prompt=, interactive=)`` that returns an exit code. The client runs
    interactively exactly when no one-shot ``prompt`` was given.

    Raises:
        typer.Exit: code 127 when the launcher raises
            ``FileNotFoundError``; otherwise the launcher's own non-zero
            return code.
    """
    launch_kwargs = {
        "base_url": _detect_server_url(url),
        "model": model,
        "prompt": prompt,
        "interactive": prompt is None,
    }
    try:
        exit_code = launcher_mod.launch(**launch_kwargs)
    except FileNotFoundError as e:
        console.print(f"[red]{e}[/red]")
        raise typer.Exit(code=127) from e
    else:
        if exit_code != 0:
            raise typer.Exit(code=exit_code)
Auto-detects localhost / docker host."), + ] = None, + model: Annotated[str, typer.Option(help="Model ID to advertise.")] = "luce-dflash", +) -> None: + """Launch OpenCode pointed at the running Lucebox server.""" + from harness.clients import opencode as launcher + + _exec_client(launcher, url=url, model=model, prompt=prompt) + + +@app.command() +def hermes( + prompt: Annotated[ + str | None, + typer.Option("--prompt", "-p", help="One-shot prompt (non-interactive)."), + ] = None, + url: Annotated[ + str | None, + typer.Option(help="Lucebox base URL. Auto-detects localhost / docker host."), + ] = None, + model: Annotated[str, typer.Option(help="Model ID to advertise.")] = "luce-dflash", +) -> None: + """Launch Hermes Agent pointed at the running Lucebox server.""" + from harness.clients import hermes as launcher + + _exec_client(launcher, url=url, model=model, prompt=prompt) + + +@app.command() +def pi( + prompt: Annotated[ + str | None, + typer.Option("--prompt", "-p", help="One-shot prompt (non-interactive)."), + ] = None, + url: Annotated[ + str | None, + typer.Option(help="Lucebox base URL. Auto-detects localhost / docker host."), + ] = None, + model: Annotated[str, typer.Option(help="Model ID to advertise.")] = "luce-dflash", +) -> None: + """Launch Pi pointed at the running Lucebox server.""" + from harness.clients import pi as launcher + + _exec_client(launcher, url=url, model=model, prompt=prompt) + + +@app.command() +def openclaw( + prompt: Annotated[ + str | None, + typer.Option("--prompt", "-p", help="One-shot prompt (non-interactive)."), + ] = None, + url: Annotated[ + str | None, + typer.Option(help="Lucebox base URL. 
Auto-detects localhost / docker host."), + ] = None, + model: Annotated[str, typer.Option(help="Model ID to advertise.")] = "luce-dflash", +) -> None: + """Launch OpenClaw pointed at the running Lucebox server.""" + from harness.clients import openclaw as launcher + + _exec_client(launcher, url=url, model=model, prompt=prompt) + + +@app.command() +def version() -> None: + """Print lucebox version.""" + print(__version__) + + +def _pick_variant_from_driver(driver_major: int, gpu_sm: str) -> config_mod.Variant: # type: ignore[name-defined] + """Mirrors lucebox.sh::pick_variant. Centralized so Python and bash agree. + + Kept as a thin wrapper around the LUCEBOX_VARIANT env var because + the variant tag is picked by the shell wrapper before Python runs; + this function exists so legacy callers and tests still resolve. + """ + del driver_major, gpu_sm # variant pick lives in lucebox.sh + return os.environ.get("LUCEBOX_VARIANT", "cuda12") + + +def main() -> None: + """Module entrypoint — `python -m lucebox`.""" + try: + app() + except KeyboardInterrupt: + console.print("\n[dim]interrupted[/dim]") + sys.exit(130) + + +if __name__ == "__main__": + main() diff --git a/lucebox/src/lucebox/config.py b/lucebox/src/lucebox/config.py new file mode 100644 index 00000000..f4217f18 --- /dev/null +++ b/lucebox/src/lucebox/config.py @@ -0,0 +1,463 @@ +"""Sparse TOML persistence for .lucebox/config.toml. + +Single source of truth for user-overridden configuration. We track which +dotted keys were explicitly set by the user (or by commands acting on +their behalf) and serialize ONLY those keys back to disk — defaults +stay implicit, so `config.toml` reads like a diff against live defaults +and upgrades that add new fields don't gratuitously rewrite every file. + +The dotted-key surface area is small and flat: + model.preset, model.target_file, model.draft_file + port, models_dir, variant, image, container_name + dflash. 
for each of the 11 DflashRuntime knobs + think_max + +Load resolves the TOML file → ``Config`` object, with anything absent +filled from ``Config()`` defaults. Save writes back only the keys that +appear in the TOML doc (tracked on ``Config._user_set``). The TOML doc +itself is a plain ``dict[str, Any]`` carrying only the set keys. +""" + +from __future__ import annotations + +import os +import re +import tomllib +from collections.abc import Callable +from dataclasses import replace +from pathlib import Path +from typing import Any + +import tomli_w + +from lucebox.types import ( + Config, + DflashRuntime, + HostFacts, + ModelMeta, + Variant, + default_models_dir, +) + + +def default_config_path() -> Path: + """Where .lucebox/config.toml lives. + + Convention: under $LUCEBOX_HOME if set, otherwise $HOME/.lucebox. Lives in + the bind-mounted host home dir so the config survives container teardown + and is editable from the host. + """ + base = os.environ.get("LUCEBOX_HOME") + if base: + return Path(base) / "config.toml" + return Path.home() / ".lucebox" / "config.toml" + + +# ── dotted-key registry ──────────────────────────────────────────────────── + +def _cast_prefill_mode(v: Any) -> str: + s = str(v) + if s not in {"off", "auto", "always"}: + raise ValueError(f"prefill_mode must be off/auto/always, got {s!r}") + return s + + +def _cast_bool(v: Any) -> bool: + """Strict-ish boolean coercion for config values. + + - Native booleans pass through. + - Strings: 1/true/yes/on → True; 0/false/no/off/"" → False (case-insensitive). + - Anything else raises ``ValueError`` rather than silently coercing, + because that's what bit ``dflash.debug_thinking_logits`` — the + built-in ``bool`` caster turned ``"false"`` into ``True``. 
+ """ + if isinstance(v, bool): + return v + if isinstance(v, str): + s = v.strip().lower() + if s in ("1", "true", "yes", "on"): + return True + if s in ("0", "false", "no", "off", ""): + return False + raise ValueError(f"cannot parse boolean: {v!r}") + if isinstance(v, int): + return bool(v) + raise ValueError(f"cannot parse boolean: {v!r}") + + +# Each entry: dotted-key → (toml_path, type_caster, default_getter). +# ``toml_path`` is the (section, field) pair on disk; ``"_root"`` means the +# key lives at the top level (no [section]). ``default_getter`` returns the +# in-memory default so ``config get`` can annotate origin. +KEY_REGISTRY: dict[str, tuple[tuple[str, str], Callable[[Any], Any]]] = { + "variant": (("image", "variant"), str), + "image": (("image", "registry"), str), + "container_name": (("runtime", "container_name"), str), + "port": (("runtime", "port"), int), + "models_dir": (("paths", "models"), str), + "model.preset": (("model", "preset"), str), + "model.target_file": (("model", "target_file"), str), + "model.draft_file": (("model", "draft_file"), str), + "dflash.budget": (("dflash", "budget"), int), + "dflash.max_ctx": (("dflash", "max_ctx"), int), + "dflash.lazy": (("dflash", "lazy"), _cast_bool), + "dflash.prefix_cache_slots": (("dflash", "prefix_cache_slots"), int), + "dflash.prefill_cache_slots": (("dflash", "prefill_cache_slots"), int), + "dflash.cache_type_k": (("dflash", "cache_type_k"), str), + "dflash.cache_type_v": (("dflash", "cache_type_v"), str), + "dflash.prefill_mode": (("dflash", "prefill_mode"), _cast_prefill_mode), + "dflash.prefill_keep_ratio": (("dflash", "prefill_keep_ratio"), float), + "dflash.prefill_threshold": (("dflash", "prefill_threshold"), int), + "dflash.prefill_drafter": (("dflash", "prefill_drafter"), str), + "dflash.think_max": (("dflash", "think_max"), int), + "dflash.fa_window": (("dflash", "fa_window"), int), + "dflash.think_soft_close_min_ratio": ( + ("dflash", "think_soft_close_min_ratio"), float), + 
"dflash.debug_thinking_logits": ( + ("dflash", "debug_thinking_logits"), _cast_bool), +} + + +def _doc_get(doc: dict[str, Any], section: str, field: str) -> Any: + if section == "_root": + return doc.get(field) + sub = doc.get(section) + if isinstance(sub, dict): + return sub.get(field) + return None + + +def _doc_set(doc: dict[str, Any], section: str, field: str, value: Any) -> None: + if section == "_root": + doc[field] = value + return + doc.setdefault(section, {})[field] = value + + +def _doc_unset(doc: dict[str, Any], section: str, field: str) -> bool: + """Remove a dotted key from the doc. Returns True iff something was removed.""" + if section == "_root": + if field in doc: + del doc[field] + return True + return False + sub = doc.get(section) + if isinstance(sub, dict) and field in sub: + del sub[field] + if not sub: + del doc[section] + return True + return False + + +# ── load ─────────────────────────────────────────────────────────────────── + + +def load(path: Path | None = None) -> Config | None: + """Load config.toml, or return None if missing. + + If a legacy `.env` sits next to it (or in place of it), migrate that + first and write back as TOML. + """ + path = path or default_config_path() + if path.exists(): + return _load_toml(path) + + legacy = path.with_suffix(".env") + if legacy.exists(): + cfg, doc = _load_legacy_env(legacy) + save(cfg, path, doc=doc) + return cfg + + return None + + +def _load_toml(path: Path) -> Config: + raw = tomllib.loads(path.read_text()) + return _from_dict(raw) + + +def load_doc(path: Path | None = None) -> dict[str, Any]: + """Return the raw TOML doc (a dict). 
def _from_dict(raw: dict[str, Any]) -> Config:
    """Build a ``Config`` from a parsed TOML mapping, defaulting absent keys.

    ``raw`` is the nested mapping read from disk (sections ``image``,
    ``runtime``, ``paths``, ``dflash``, ``host``, ``model``). Every key
    that is missing falls back to the literal default written below.

    Boolean fields given as strings are parsed with ``_cast_bool`` rather
    than the built-in ``bool``, so a hand-edited ``lazy = "false"`` loads
    as ``False`` instead of ``True``. Non-string values keep plain
    ``bool()`` truthiness.

    Raises:
        ValueError: a numeric field cannot be converted, or a boolean
            field holds a string ``_cast_bool`` does not recognise.
    """

    def _flag(section: dict[str, Any], key: str) -> bool:
        # Strings need word-aware parsing; bool("false") would be True.
        value = section.get(key, False)
        return _cast_bool(value) if isinstance(value, str) else bool(value)

    img = raw.get("image", {})
    variant: Variant = str(img.get("variant", "cuda12"))
    registry = img.get("registry", "ghcr.io/luce-org/lucebox-hub")

    runtime = raw.get("runtime", {})
    port = int(runtime.get("port", 8080))
    container_name = str(runtime.get("container_name", "lucebox"))

    paths = raw.get("paths", {})
    models_dir = Path(paths.get("models", str(default_models_dir())))

    df = raw.get("dflash", {})
    dflash = DflashRuntime(
        budget=int(df.get("budget", 22)),
        max_ctx=int(df.get("max_ctx", 16384)),
        lazy=_flag(df, "lazy"),
        prefix_cache_slots=int(df.get("prefix_cache_slots", 0)),
        prefill_cache_slots=int(df.get("prefill_cache_slots", 0)),
        cache_type_k=str(df.get("cache_type_k", "")),
        cache_type_v=str(df.get("cache_type_v", "")),
        prefill_mode=df.get("prefill_mode", "off"),
        prefill_keep_ratio=float(df.get("prefill_keep_ratio", 0.05)),
        prefill_threshold=int(df.get("prefill_threshold", 32000)),
        prefill_drafter=str(df.get("prefill_drafter", "")),
        think_max=int(df.get("think_max", 15488)),
        fa_window=int(df.get("fa_window", 0)),
        think_soft_close_min_ratio=float(
            df.get("think_soft_close_min_ratio", 0.0)),
        debug_thinking_logits=_flag(df, "debug_thinking_logits"),
    )

    host_raw = raw.get("host", {})
    host = HostFacts(
        nproc=int(host_raw.get("nproc", 0)),
        ram_gb=int(host_raw.get("ram_gb", 0)),
        gpu_vendor=host_raw.get("gpu_vendor", "none"),
        gpu_name=str(host_raw.get("gpu_name", "")),
        gpu_count=int(host_raw.get("gpu_count", 0)),
        vram_gb=int(host_raw.get("vram_gb", 0)),
        gpu_sm=str(host_raw.get("gpu_sm", "")),
        driver_version=str(host_raw.get("driver_version", "")),
        driver_major=int(host_raw.get("driver_major", 0)),
        has_systemd=_flag(host_raw, "has_systemd"),
        is_wsl=_flag(host_raw, "is_wsl"),
        has_docker=_flag(host_raw, "has_docker"),
        docker_version=str(host_raw.get("docker_version", "")),
        ctk=host_raw.get("ctk", "none"),
    )

    # `[model]` is optional — configs without that section keep working.
    # If `preset` is set but `target_file` / `draft_file` isn't, derive
    # them from the registry so users only have to write one key.
    mdl = raw.get("model", {})
    preset_name = str(mdl.get("preset", ""))
    target_file = str(mdl.get("target_file", ""))
    draft_file = str(mdl.get("draft_file", ""))
    if preset_name and (not target_file or not draft_file):
        # Imported here rather than at module scope (kept from the original).
        from lucebox.download import PRESETS

        if preset_name in PRESETS:
            pres = PRESETS[preset_name]
            if not target_file:
                target_file = pres.target_file
            if not draft_file and pres.has_draft and pres.draft_file:
                draft_file = pres.draft_file
    model = ModelMeta(preset=preset_name, target_file=target_file, draft_file=draft_file)

    return Config(
        variant=variant,
        image=registry,
        container_name=container_name,
        port=port,
        models_dir=models_dir,
        dflash=dflash,
        host=host,
        model=model,
    )
+ del cfg + return path + + +# ── dotted-key API ───────────────────────────────────────────────────────── + + +def _value_to_toml(value: Any) -> Any: + """Make a Python value safe for tomli_w (no None, Path→str).""" + if isinstance(value, Path): + return str(value) + return value + + +def _live_default(key: str) -> Any: + """Return the in-memory default for ``key`` (from a fresh Config()).""" + cfg = Config() + section_field = KEY_REGISTRY[key][0] + section, field = section_field + if section == "image": + return {"variant": cfg.variant, "registry": cfg.image}[field] + if section == "runtime": + return {"port": cfg.port, "container_name": cfg.container_name}[field] + if section == "paths": + return str(cfg.models_dir) if field == "models" else None + if section == "dflash": + return getattr(cfg.dflash, field) + if section == "model": + return getattr(cfg.model, field) + return None + + +def config_set(key: str, value: Any, *, path: Path | None = None) -> None: + """Set one dotted key and write the file. Auto-creates a missing file.""" + if key not in KEY_REGISTRY: + raise KeyError(f"unknown config key {key!r}; known: {sorted(KEY_REGISTRY)}") + section_field, caster = KEY_REGISTRY[key] + section, field = section_field + try: + cast_value = caster(value) + except (TypeError, ValueError) as exc: + raise ValueError(f"cannot coerce {value!r} for {key}: {exc}") from exc + path = path or default_config_path() + doc = load_doc(path) if path.exists() else {} + _doc_set(doc, section, field, _value_to_toml(cast_value)) + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(".toml.tmp") + tmp.write_bytes(tomli_w.dumps(doc).encode("utf-8")) + tmp.replace(path) + + +def config_unset(key: str, *, path: Path | None = None) -> bool: + """Remove a dotted key from the file. 
def live_config(preset_name: str | None = None) -> Config:
    """Build a fresh Config from current host facts + heuristic autotune.

    The result starts from ``Config()`` defaults and overrides:

    - ``variant`` / ``image`` / ``container_name`` / ``port`` /
      ``models_dir`` from ``LUCEBOX_VARIANT``, ``LUCEBOX_IMAGE``,
      ``LUCEBOX_CONTAINER``, ``LUCEBOX_PORT`` and ``LUCEBOX_MODELS``
      when those environment variables are set;
    - ``host`` from ``lucebox.host_facts.from_env()``;
    - ``dflash`` from ``autotune.runtime_from_host(host)``.

    When ``preset_name`` is set, ``[model]`` is pinned to that preset's
    target file, plus its draft file only if the preset actually has a
    draft (``has_draft``) — the same guard ``_from_dict`` applies.

    Raises:
        KeyError: ``preset_name`` is not a known preset (from
            ``resolve_preset``).
        ValueError: ``LUCEBOX_PORT`` is set to a non-integer.
    """
    # Lazy imports to avoid the autotune ↔ config ↔ download cycle the
    # importer would hit if these moved to module scope.
    import lucebox.autotune as autotune_mod
    import lucebox.download as download_mod
    from lucebox.host_facts import from_env

    host = from_env()
    variant = os.environ.get("LUCEBOX_VARIANT", "cuda12")
    dflash = autotune_mod.runtime_from_host(host)
    default = Config()
    model = ModelMeta()
    if preset_name:
        preset = download_mod.resolve_preset(preset_name)
        # Parenthesised on purpose: without the parentheses `or` binds
        # tighter than the conditional and `has_draft` is never consulted.
        draft = (preset.draft_file or "") if preset.has_draft else ""
        model = ModelMeta(
            preset=preset.name,
            target_file=preset.target_file,
            draft_file=draft,
        )
    return replace(
        default,
        variant=variant,
        image=os.environ.get("LUCEBOX_IMAGE", default.image),
        container_name=os.environ.get("LUCEBOX_CONTAINER", default.container_name),
        port=int(os.environ.get("LUCEBOX_PORT", str(default.port))),
        models_dir=Path(os.environ.get("LUCEBOX_MODELS", str(default.models_dir))),
        dflash=dflash,
        host=host,
        model=model,
    )
+""" + +from __future__ import annotations + +import os +import shlex +import subprocess +from dataclasses import dataclass +from pathlib import Path + +from lucebox.types import Config + + +def _host_facts_env() -> list[tuple[str, str]]: + """Forward LUCEBOX_HOST_* from the orchestrator's env into the server. + + lucebox.sh's probe_host() exports every host-identity fact (OS, + kernel, GPU list CSV, CTK version, …) before invoking ``docker run`` + on the orchestrator. The orchestrator inherits them and we pass + them through verbatim so the server entrypoint can write + /opt/lucebox-hub/HOST_INFO without re-probing inside the container + (where /proc and nvidia-smi see the container's view, not the + rig's). See entrypoint.sh::write_host_info and http_server.cpp's + /props.host block. + """ + out: list[tuple[str, str]] = [] + for key, value in sorted(os.environ.items()): + if key.startswith("LUCEBOX_HOST_"): + out.append((key, value)) + return out + + +def _resolve_model_files(cfg: Config) -> tuple[str, str, str]: + """Return (target_file, draft_file, draft_dir) for DFLASH_TARGET / DFLASH_DRAFT. + + Resolution order — first non-empty wins per field: + 1. cfg.model.target_file / draft_file (explicit override in config.toml) + 2. PRESETS[cfg.model.preset].target_file / draft_file / speculator_dir (registry) + 3. "" (entrypoint autodetect path runs unchanged). + + ``draft_dir`` is a directory name under ``models/draft/`` holding a + safetensors speculator (e.g. ``laguna-xs2-speculator``). It is only set + when the preset declares one AND the directory exists on disk; otherwise + it is empty. When non-empty, docker_run_spec uses it as DFLASH_DRAFT + (a directory path) instead of the GGUF-file path, allowing the entrypoint + to discover the safetensors file inside it. + + Imported lazily to avoid the lucebox.types ↔ lucebox.download circular + import that surfaces when this module is imported from ``__init__``. 
+ """ + target = cfg.model.target_file + draft = cfg.model.draft_file + draft_dir = "" + if (not target or not draft) and cfg.model.preset: + from lucebox.download import PRESETS + + pres = PRESETS.get(cfg.model.preset) + if pres is not None: + if not target: + target = pres.target_file + if not draft and pres.has_draft and pres.draft_file: + draft = pres.draft_file + if not draft and pres.speculator_dir: + spec_path = cfg.models_dir / "draft" / pres.speculator_dir + if spec_path.is_dir(): + draft_dir = pres.speculator_dir + return target, draft, draft_dir + + +def _runtime_volumes(cfg: Config) -> tuple[tuple[str, str], ...]: + """Mount models plus $HOME so absolute symlink targets remain valid.""" + home = str(Path.home()) + models = str(cfg.models_dir) + volumes = [(models, "/opt/lucebox-hub/server/models")] + if home != models: + volumes.append((home, home)) + return tuple(volumes) + + +@dataclass(frozen=True, slots=True) +class DockerRunSpec: + """Pre-render of a docker-run command. Render via `argv()` or `printable()`.""" + + image: str + name: str + gpus: bool = True + detach: bool = False + remove: bool = True + port_publish: tuple[int, int] | None = None # (host, container) + volumes: tuple[tuple[str, str], ...] = () + env: tuple[tuple[str, str], ...] = () + entrypoint_args: tuple[str, ...] = () + extra: tuple[str, ...] 
= () + + def argv(self) -> list[str]: + out = ["docker", "run"] + if self.remove: + out.append("--rm") + if self.detach: + out.append("-d") + out += ["--name", self.name] + if self.gpus: + out += ["--gpus", "all"] + if self.port_publish is not None: + host, container = self.port_publish + out += ["-p", f"{host}:{container}"] + for host_path, container_path in self.volumes: + out += ["-v", f"{host_path}:{container_path}"] + for k, v in self.env: + out += ["-e", f"{k}={v}"] + out += list(self.extra) + out.append(self.image) + out += list(self.entrypoint_args) + return out + + def printable(self) -> str: + """Human-readable, one-flag-per-line docker run. Copy-pasteable.""" + argv = self.argv() + if not argv: + return "" + out = argv[0] + i = 1 + while i < len(argv): + tok = argv[i] + out += " \\\n " + tok + # Glue value-taking flags onto the same line. + if tok in { + "-p", + "-v", + "-e", + "--name", + "--gpus", + "--env", + "--volume", + "--publish", + "--entrypoint", + } and i + 1 < len(argv): + i += 1 + out += " " + shlex.quote(argv[i]) + i += 1 + return out + + +# ── server argv from Config ──────────────────────────────────────────────── + + +def server_run_spec(cfg: Config) -> DockerRunSpec: + """Long-running OpenAI-compatible server. Foreground (systemd manages + lifecycle), --gpus all, models bind-mounted, DFLASH_* propagated. + """ + # LUCEBOX_HOST_* first so they ride out front in the rendered argv, + # making it obvious in `print-run` output what host facts get forwarded. + env: list[tuple[str, str]] = list(_host_facts_env()) + env += [ + ("DFLASH_BUDGET", str(cfg.dflash.budget)), + ("DFLASH_MAX_CTX", str(cfg.dflash.max_ctx)), + ("DFLASH_PREFIX_CACHE_SLOTS", str(cfg.dflash.prefix_cache_slots)), + ("DFLASH_PREFILL_CACHE_SLOTS", str(cfg.dflash.prefill_cache_slots)), + ("DFLASH_THINK_MAX", str(cfg.dflash.think_max)), + ("DFLASH_PORT", "8080"), + ] + # Resolve target/draft GGUFs in priority order: + # 1. 
cfg.model.target_file / draft_file (explicit override in config.toml) + # 2. PRESETS[cfg.model.preset].target_file / draft_file / speculator_dir (registry) + # 3. unset — entrypoint's autodetect path runs unchanged. + # Container view of the models dir is /opt/lucebox-hub/server/models + # (see _runtime_volumes); the entrypoint reads DFLASH_TARGET / DFLASH_DRAFT. + # draft_dir is a subdirectory of models/draft/ holding a safetensors speculator; + # it takes effect only when draft_file is empty and the directory exists on disk. + target_file, draft_file, draft_dir = _resolve_model_files(cfg) + if target_file: + env.append(("DFLASH_TARGET", f"/opt/lucebox-hub/server/models/{target_file}")) + if draft_file: + env.append(("DFLASH_DRAFT", f"/opt/lucebox-hub/server/models/draft/{draft_file}")) + elif draft_dir: + env.append(("DFLASH_DRAFT", f"/opt/lucebox-hub/server/models/draft/{draft_dir}")) + if cfg.dflash.lazy: + env.append(("DFLASH_LAZY", "1")) + if cfg.dflash.cache_type_k: + env.append(("DFLASH_CACHE_TYPE_K", cfg.dflash.cache_type_k)) + if cfg.dflash.cache_type_v: + env.append(("DFLASH_CACHE_TYPE_V", cfg.dflash.cache_type_v)) + if cfg.dflash.prefill_mode != "off": + env += [ + ("DFLASH_PREFILL_MODE", cfg.dflash.prefill_mode), + ("DFLASH_PREFILL_KEEP", str(cfg.dflash.prefill_keep_ratio)), + ("DFLASH_PREFILL_THRESHOLD", str(cfg.dflash.prefill_threshold)), + ] + if cfg.dflash.prefill_drafter: + env.append(("DFLASH_PREFILL_DRAFTER", cfg.dflash.prefill_drafter)) + # fa_window=0 is the server's own default (full attention); only emit + # the env when the operator has selected a sparse decode window. The + # entrypoint mirrors this guard so an unset env reproduces the + # server's stock behavior. + if cfg.dflash.fa_window > 0: + env.append(("DFLASH_FA_WINDOW", str(cfg.dflash.fa_window))) + # Soft-close ratio: 0.0 is server-side disabled (byte-identical + # to pre-PR-#326 behavior). 
def docker_pull(image_tag: str) -> int:
    """Run ``docker pull <image_tag>`` and return docker's exit code.

    The child process inherits this process's stdio, so docker's own
    progress output goes straight to the terminal.
    """
    command = ["docker", "pull", image_tag]
    return subprocess.call(command)
Laguna's speculator is safetensors, not GGUF) carry +``draft_repo=None`` and run target-only. +""" + +from __future__ import annotations + +import hashlib +import os +import threading +import time +from dataclasses import dataclass +from pathlib import Path + +# hf-xet (huggingface_hub ≥ 1.16) streams the entire file in one final +# burst — the polling-based progress bar sits at 0% for ~14 minutes +# then snaps to 100% on a 17 GB GGUF. Force the chunked Python +# downloader instead so bytes grow continuously and the Rich bar tracks +# reality. Set before importing hf_hub_download so the import picks +# the env up. `setdefault` lets a user override on the command line. +os.environ.setdefault("HF_HUB_DISABLE_XET", "1") + +from huggingface_hub import HfApi, hf_hub_download # noqa: E402 +from huggingface_hub._local_folder import get_local_download_paths # noqa: E402 +from rich.console import Console +from rich.progress import ( + BarColumn, + DownloadColumn, + Progress, + TextColumn, + TimeRemainingColumn, + TransferSpeedColumn, +) + +from lucebox.types import Config + + +@dataclass(frozen=True, slots=True) +class ModelPreset: + """Canonical (target, draft) repo+filename pair for a supported model. + + ``draft_repo`` and ``draft_file`` may both be ``None`` for models + where no GGUF DFlash draft is published (e.g. Laguna's safetensors + speculator). In that case the entrypoint runs target-only — DFlash + speculative decoding is disabled but the server still works. + + ``speculator_dir`` names a directory under ``models/draft/`` that holds + a safetensors-format speculator (e.g. ``model.safetensors``). When + present on disk the server launch sets ``DFLASH_DRAFT`` to that + directory; absent, the server runs target-only. Unlike ``draft_file`` + (which marks the preset as incomplete when missing), ``speculator_dir`` + is optional supplementary hardware and doesn't affect installed_status. 
+ """ + + name: str + target_repo: str + target_file: str + draft_repo: str | None + draft_file: str | None + approx_total_gb: int + description: str = "" + speculator_dir: str | None = None + + @property + def has_draft(self) -> bool: + return bool(self.draft_repo and self.draft_file) + + +# Registry of supported models. Keyed by preset name; the CLI surface +# exposes these via `lucebox models download ` and the +# `lucebox models list` table. The values come straight from the model +# cards under share/model_cards/ — keep them in sync. +PRESETS: dict[str, ModelPreset] = { + "qwen3.6-27b": ModelPreset( + name="qwen3.6-27b", + target_repo="unsloth/Qwen3.6-27B-GGUF", + target_file="Qwen3.6-27B-Q4_K_M.gguf", + draft_repo="spiritbuun/Qwen3.6-27B-DFlash-GGUF", + draft_file="dflash-draft-3.6-q4_k_m.gguf", + approx_total_gb=17, + description="Qwen3.6 27B dense (Q4_K_M) + Qwen3.6 DFlash draft. Lucebox default.", + ), + "gemma-4-26b": ModelPreset( + name="gemma-4-26b", + target_repo="bartowski/google_gemma-4-26B-A4B-it-GGUF", + target_file="google_gemma-4-26B-A4B-it-Q4_K_M.gguf", + draft_repo="Lucebox/gemma-4-26B-A4B-it-DFlash-GGUF", + draft_file="gemma-4-26B-A4B-it-DFlash-q8_0.gguf", + approx_total_gb=18, + description="Gemma 4 26B-A4B IT MoE (Q4_K_M) + Lucebox DFlash q8_0 draft.", + ), + "gemma-4-31b": ModelPreset( + name="gemma-4-31b", + target_repo="bartowski/google_gemma-4-31B-it-GGUF", + target_file="google_gemma-4-31B-it-Q4_K_M.gguf", + draft_repo="Lucebox/gemma-4-31B-it-DFlash-GGUF", + draft_file="gemma-4-31B-it-DFlash-q8_0.gguf", + approx_total_gb=21, + description="Gemma 4 31B IT dense (Q4_K_M) + Lucebox DFlash q8_0 draft.", + ), + "laguna-xs.2": ModelPreset( + name="laguna-xs.2", + target_repo="Lucebox/Laguna-XS.2-GGUF", + target_file="laguna-xs2-Q4_K_M.gguf", + # Laguna's DFlash speculator is safetensors-format + # (poolside/Laguna-XS.2-speculator.dflash), downloaded manually + # into models/draft/laguna-xs2-speculator/. 
The download command + # doesn't fetch it automatically — it's opt-in. When present, + # speculator_dir wires it into DFLASH_DRAFT at server launch. + draft_repo=None, + draft_file=None, + speculator_dir="laguna-xs2-speculator", + approx_total_gb=20, + description=( + "Laguna-XS.2 MoE code model (Q4_K_M). " + "DFlash safetensors speculator in draft/laguna-xs2-speculator/ " + "is used automatically when present." + ), + ), + "qwen3.6-moe": ModelPreset( + name="qwen3.6-moe", + target_repo="unsloth/Qwen3.6-35B-A3B-GGUF", + # Unsloth's MoE repo publishes both a "UD" (dynamic) and a plain + # Q4_K_M family. Verified 2026-05-28 via HfApi.repo_info: the + # `-UD-Q4_K_M.gguf` variant (22.1 GB) is the canonical Q4_K_M + # release — there is no plain `Q4_K_M.gguf` on the MoE repo. + target_file="Qwen3.6-35B-A3B-UD-Q4_K_M.gguf", + # No DFlash draft GGUF has been published for the MoE variant + # (probed Lucebox/* and spiritbuun/* repos 2026-05-28 — none + # exist). Target-only, mirroring laguna-xs.2's wiring. The + # lucebox C++ server speaks the `qwen35moe` arch natively + # (server/src/qwen35moe/) so this runs without a draft. + draft_repo=None, + draft_file=None, + approx_total_gb=22, + description=( + "Qwen3.6 35B-A3B MoE (3B active per token), Q4_K_M unsloth " + "dynamic quant. Target-only — no DFlash MoE draft published " + "yet. Uses lucebox's qwen35moe arch backend." + ), + ), +} + +DEFAULT_PRESET = PRESETS["qwen3.6-27b"] + + +def resolve_preset(name: str | None) -> ModelPreset: + """Look up a preset by name, with a friendly error on typos. + + ``None`` (or empty string) resolves to :data:`DEFAULT_PRESET` so + callers and the CLI default both flow through one code path. + """ + if not name: + return DEFAULT_PRESET + if name in PRESETS: + return PRESETS[name] + # Build a suggestion list — show every known preset; the user's + # search space is small (4 entries today) so listing them all is + # cheaper and clearer than a fuzzy-match heuristic. 
+ known = ", ".join(sorted(PRESETS.keys())) + raise KeyError(f"unknown preset {name!r}. Known presets: {known}") + + +def _file_meta(api: HfApi, repo_id: str, filename: str) -> tuple[int, str | None]: + """Return (expected_size, lfs_sha256_or_None) for filename in repo_id.""" + info = api.model_info(repo_id, files_metadata=True) + for sib in info.siblings or []: + if sib.rfilename == filename: + sha = getattr(sib.lfs, "sha256", None) if sib.lfs else None + return int(sib.size or 0), sha + raise FileNotFoundError(f"{filename} not present in repo {repo_id}") + + +def _sha256(path: Path, chunk_mb: int = 16) -> str: + h = hashlib.sha256() + chunk = chunk_mb * 1024 * 1024 + with path.open("rb") as f: + while buf := f.read(chunk): + h.update(buf) + return h.hexdigest() + + +def _local_matches(path: Path, size: int, sha256: str | None, console: Console) -> bool: + """True iff a local file at `path` matches the expected size + sha256. + + Size mismatch shortcircuits (cheap). Sha256 is verified for LFS files + (multi-GB GGUFs always carry one) and skipped when the repo doesn't + expose a hash. Hashing 17 GB takes ~30s on a fast SSD — worth it to + avoid a multi-GB re-download on rate-limited / metered links. + """ + if not path.exists(): + return False + actual_size = path.stat().st_size + if actual_size != size: + console.print( + f" [yellow]✗[/yellow] {path.name} present but size {actual_size:,} != " + f"expected {size:,} — will re-download" + ) + return False + if sha256: + console.print(f" [dim]verifying sha256 of {path.name} ({actual_size / 1e9:.1f} GB)…[/dim]") + actual_sha = _sha256(path) + if actual_sha != sha256: + console.print( + f" [yellow]✗[/yellow] {path.name} sha256 {actual_sha[:12]}… != " + f"expected {sha256[:12]}… — will re-download" + ) + return False + return True + + +def _incomplete_path_candidates(local_dir: Path, filename: str, etag: str | None) -> list[Path]: + """Return likely paths of the partial file currently being written. 
+ + huggingface_hub 1.x (with hf-xet) stages downloads under + ``{local_dir}/.cache/huggingface/download/`` using a *hashed* name — + ``{short_hash(metadata_filename)}.{etag}.incomplete`` — so a naive + ``{filename}.incomplete`` poll never sees any growth and the + progress bar sits at 0 % for the whole multi-GB transfer. + + We get the *exact* expected staging path from + ``get_local_download_paths().incomplete_path(etag)`` when we already + know the LFS sha256 (which acts as the etag for Xet downloads), and + fall back to globbing every ``*.incomplete`` in the staging dir + otherwise. The legacy non-Xet downloader writes a ``.incomplete`` + next to the destination blob in ``~/.cache/huggingface/hub`` — but + when ``local_dir`` is set hf-hub always uses the local staging dir, + so the two candidates above cover every code path we hit. + """ + paths = get_local_download_paths(local_dir, filename) + candidates: list[Path] = [] + if etag: + candidates.append(paths.incomplete_path(etag)) + # Fallback: every .incomplete file in the staging dir. This is what + # rescues us when sha256 is unknown (non-LFS file) or when hf-hub + # changes the etag derivation again in some future release. + candidates.append(paths.metadata_path.parent) # sentinel: glob this dir + return candidates + + +def _current_bytes(target: Path, candidates: list[Path]) -> int: + """Best-effort byte count of the file currently being written.""" + if target.exists(): + try: + return target.stat().st_size + except OSError: + pass + for c in candidates: + if c.is_dir(): + # Glob every .incomplete in the staging dir; return the + # largest (there's typically only one in-flight transfer). 
+ largest = 0 + try: + for p in c.glob("*.incomplete"): + try: + largest = max(largest, p.stat().st_size) + except OSError: + continue + except OSError: + continue + if largest: + return largest + else: + try: + if c.exists(): + return c.stat().st_size + except OSError: + continue + return 0 + + +def _download_with_progress( + repo_id: str, + filename: str, + local_dir: Path, + expected_size: int, + console: Console, + etag: str | None = None, +) -> Path: + """Download a single HF file with a Rich progress bar. + + Runs hf_hub_download in a worker thread; the main thread polls the + growing file size and updates the Rich progress bar. The polled + target is computed via ``get_local_download_paths`` so we hit the + actual hf-xet staging path (a hashed filename under + ``.cache/huggingface/download/``), not a guess. + """ + local_dir.mkdir(parents=True, exist_ok=True) + target = local_dir / filename + candidates = _incomplete_path_candidates(local_dir, filename, etag) + + result: list[str | None] = [None] + error: list[BaseException | None] = [None] + + def _worker() -> None: + try: + result[0] = hf_hub_download( + repo_id=repo_id, + filename=filename, + local_dir=str(local_dir), + ) + except BaseException as exc: # propagate to main thread + error[0] = exc + + t = threading.Thread(target=_worker, daemon=True) + t.start() + + with Progress( + TextColumn("[cyan]{task.description}"), + BarColumn(bar_width=40), + DownloadColumn(), + TransferSpeedColumn(), + TimeRemainingColumn(), + console=console, + transient=False, + ) as progress: + task = progress.add_task(filename, total=expected_size or 1) + while t.is_alive(): + current = _current_bytes(target, candidates) + # Always tick the bar — even at 0 bytes — so Rich repaints + # the spinner/ETA and the user sees the UI is alive within + # the first poll tick rather than a blank "Downloading…" line. 
+ progress.update(task, completed=min(current, expected_size or current or 1)) + time.sleep(0.5) + # Final tick after the worker finishes so the bar paints 100%. + if target.exists(): + progress.update(task, completed=target.stat().st_size) + + t.join(timeout=5) + if error[0] is not None: + raise error[0] + if result[0] is None: + raise RuntimeError(f"hf_hub_download returned no path for {filename}") + return Path(result[0]) + + +def _fetch( + api: HfApi, + repo_id: str, + filename: str, + local_dir: Path, + console: Console, +) -> Path: + """Verify-or-download a single file. Skips when the local copy matches.""" + size, sha = _file_meta(api, repo_id, filename) + target = local_dir / filename + if _local_matches(target, size, sha, console): + console.print(f" [green]✓[/green] {filename} already present (size + sha256 match)") + return target + # `sha` doubles as the etag for hf-xet's staging path + # ({local_dir}/.cache/huggingface/download/{hash}.{etag}.incomplete); + # passing it through is what makes the Rich progress bar see real + # byte counts during the multi-GB transfer. + return _download_with_progress(repo_id, filename, local_dir, size, console, etag=sha) + + +def download_preset(cfg: Config, preset: ModelPreset | None = None) -> int: + """Fetch the target GGUF + (optional) DFlash draft into cfg.models_dir. + + Returns 0 on success, non-zero on failure. Verifies each file's size + and (LFS) sha256 against the repo metadata before downloading, so a + repeat run with the files already on disk is a no-op + sha256 walk. + + ``preset=None`` resolves to :data:`DEFAULT_PRESET` for back-compat; + presets with ``has_draft=False`` (e.g. Laguna) skip the draft fetch + entirely and let the server run target-only. 
+ """ + preset = preset or DEFAULT_PRESET + console = Console() + api = HfApi() + models = cfg.models_dir + models.mkdir(parents=True, exist_ok=True) + draft = models / "draft" + draft.mkdir(exist_ok=True) + + try: + _fetch(api, preset.target_repo, preset.target_file, models, console) + if preset.has_draft: + # Narrow the optionals for the type-checker — has_draft is + # exactly the predicate that proves these aren't None. + assert preset.draft_repo is not None and preset.draft_file is not None + _fetch(api, preset.draft_repo, preset.draft_file, draft, console) + else: + console.print( + f" [dim]no DFlash draft published for {preset.name} — running target-only[/dim]" + ) + except Exception as exc: + console.print(f"[red]download failed:[/red] {exc}") + return 1 + return 0 + + +def _local_target_path(cfg: Config, preset: ModelPreset) -> Path: + return cfg.models_dir / preset.target_file + + +def _local_draft_path(cfg: Config, preset: ModelPreset) -> Path | None: + if not (preset.has_draft and preset.draft_file): + return None + return cfg.models_dir / "draft" / preset.draft_file + + +def installed_status(cfg: Config, preset: ModelPreset) -> str: + """Return ``"installed"`` / ``"partial"`` / ``"absent"`` for a preset. + + Existence-only — checks neither size nor sha256. ``"installed"`` requires the target (and + draft when one is published) to exist on disk; ``"partial"`` means + at least one of the two is present but the set is incomplete.
+ """ + target_exists = _local_target_path(cfg, preset).exists() + draft_path = _local_draft_path(cfg, preset) + if draft_path is None: + return "installed" if target_exists else "absent" + draft_exists = draft_path.exists() + if target_exists and draft_exists: + return "installed" + if target_exists or draft_exists: + return "partial" + return "absent" + + +def installed_size_gb(cfg: Config, preset: ModelPreset) -> float: + """Sum of on-disk byte sizes for the preset's files, in GB (decimal, 1e9 bytes).""" + total = 0 + target = _local_target_path(cfg, preset) + if target.exists(): + try: + total += target.stat().st_size + except OSError: + pass + draft = _local_draft_path(cfg, preset) + if draft is not None and draft.exists(): + try: + total += draft.stat().st_size + except OSError: + pass + return total / 1e9 + + +def installed_presets(cfg: Config) -> list[ModelPreset]: + """Return every preset whose files are currently present in cfg.models_dir. + + "Present" follows ``installed_status`` — fully installed only. + Partial states (target without draft, etc.) are excluded so the + default ``lucebox models`` view stays uncluttered. + """ + out: list[ModelPreset] = [] + for name in sorted(PRESETS): + pres = PRESETS[name] + if installed_status(cfg, pres) == "installed": + out.append(pres) + return out + + +def status(cfg: Config, preset: ModelPreset | None = None) -> dict[str, bool]: + """Quick presence check — what's already on disk? Size-only, no sha256. + + For presets without a published DFlash draft, ``draft_present`` is + reported as ``True`` (nothing to fetch → nothing missing). That + keeps the "all present, nothing to do" UX path uniform whether or + not a draft exists.
+ """ + preset = preset or DEFAULT_PRESET + api = HfApi() + out: dict[str, bool] = {} + try: + size, _ = _file_meta(api, preset.target_repo, preset.target_file) + local = cfg.models_dir / preset.target_file + out["target_present"] = local.exists() and local.stat().st_size == size + except Exception: + out["target_present"] = False + + if preset.has_draft: + assert preset.draft_repo is not None and preset.draft_file is not None + try: + size, _ = _file_meta(api, preset.draft_repo, preset.draft_file) + local = cfg.models_dir / "draft" / preset.draft_file + out["draft_present"] = local.exists() and local.stat().st_size == size + except Exception: + out["draft_present"] = False + else: + out["draft_present"] = True + return out diff --git a/lucebox/src/lucebox/host_check.py b/lucebox/src/lucebox/host_check.py new file mode 100644 index 00000000..2ce8d388 --- /dev/null +++ b/lucebox/src/lucebox/host_check.py @@ -0,0 +1,232 @@ +"""Readiness check: aggregate HostFacts (provided by lucebox.sh) with the +docker-daemon checks we can do from inside the container via the mounted +socket. Prints a status report and returns an aggregate severity. 
+""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import Literal + +from rich.console import Console + +from lucebox.types import HostFacts + +Severity = Literal["ok", "warn", "fail"] +_SEVERITY_ORDER: dict[Severity, int] = {"ok": 0, "warn": 1, "fail": 2} + + +@dataclass(frozen=True, slots=True) +class CheckResult: + name: str + severity: Severity + message: str + hint: str | None = None + + +def run_checks(host: HostFacts) -> list[CheckResult]: + return [ + _check_docker(host), + _check_nvidia_driver(host), + _check_ctk(host), + _check_ram(host), + _check_vram(host), + _check_systemd(host), + ] + + +def _check_docker(host: HostFacts) -> CheckResult: + if not host.has_docker: + return CheckResult( + "docker", + "fail", + "docker daemon unreachable", + "sudo systemctl start docker, or add your user to the 'docker' group", + ) + return CheckResult("docker", "ok", f"daemon reachable ({host.docker_version})") + + +def _check_nvidia_driver(host: HostFacts) -> CheckResult: + if host.gpu_vendor != "nvidia": + if host.gpu_vendor == "amd": + return CheckResult( + "gpu", + "fail", + "AMD GPU detected — prebuilt images are NVIDIA-only", + "Build dflash from source with HIP; see dflash/README.md", + ) + return CheckResult("gpu", "fail", "no NVIDIA GPU detected") + if not host.driver_version: + return CheckResult( + "driver", + "warn", + "nvidia-smi present but NVML query failed (likely driver/library mismatch)", + "reboot, or reinstall the matching NVIDIA driver", + ) + if host.driver_major < 525: + return CheckResult( + "driver", + "fail", + f"driver r{host.driver_major} too old (need r525+ for cuda12)", + "upgrade the NVIDIA driver", + ) + return CheckResult("driver", "ok", f"nvidia r{host.driver_major} ({host.driver_version})") + + +def _check_ctk(host: HostFacts) -> CheckResult: + match host.ctk: + case "runtime": + return CheckResult("ctk", "ok", "NVIDIA Container Toolkit registered as docker runtime") + case "cdi": + 
return CheckResult("ctk", "ok", "NVIDIA Container Toolkit available via CDI") + case "installed-unwired": + return CheckResult( + "ctk", + "warn", + "NVIDIA Container Toolkit installed but not wired into docker", + "sudo nvidia-ctk runtime configure --runtime=docker && " + "sudo systemctl restart docker", + ) + case _: + return CheckResult( + "ctk", + "fail", + "NVIDIA Container Toolkit not installed", + "https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html", + ) + + +def _check_ram(host: HostFacts) -> CheckResult: + if host.ram_gb == 0: + return CheckResult("ram", "warn", "RAM unknown") + if host.ram_gb < 16: + return CheckResult("ram", "warn", f"{host.ram_gb} GB RAM — model load may swap") + return CheckResult("ram", "ok", f"{host.ram_gb} GB RAM") + + +def _check_vram(host: HostFacts) -> CheckResult: + if host.vram_gb == 0: + return CheckResult("vram", "warn", "VRAM unknown") + if host.vram_gb < 12: + return CheckResult( + "vram", + "fail", + f"VRAM {host.vram_gb} GB < 12 GB — 27B target won't fit", + "use a smaller model preset or larger GPU", + ) + if host.vram_gb < 22: + return CheckResult( + "vram", + "warn", + f"VRAM {host.vram_gb} GB — 27B fits but max_ctx will be capped near 32K", + ) + return CheckResult("vram", "ok", f"VRAM {host.vram_gb} GB ({host.gpu_name})") + + +def _check_systemd(host: HostFacts) -> CheckResult: + if not host.has_systemd: + return CheckResult( + "systemd", + "warn", + "user systemd not available", + "WSL: enable systemd in /etc/wsl.conf; otherwise 'lucebox serve' " + "still works in the foreground", + ) + return CheckResult("systemd", "ok", "user systemd available") + + +def aggregate(results: list[CheckResult]) -> Severity: + worst: Severity = "ok" + for r in results: + if _SEVERITY_ORDER[r.severity] > _SEVERITY_ORDER[worst]: + worst = r.severity + return worst + + +def render(console: Console, host: HostFacts, results: list[CheckResult]) -> Severity: + """Print a status block, return the 
worst severity.""" + summary = f"[bold]Host:[/bold] {host.nproc} CPUs · {host.ram_gb} GB RAM" + if host.gpu_vendor == "nvidia" and host.gpu_name: + summary += f" · {host.gpu_name} · {host.vram_gb} GB VRAM" + ( + f" (sm_{host.gpu_sm})" if host.gpu_sm else "" + ) + if host.is_wsl: + summary += " · WSL2" + console.print(summary) + console.print() + + sev_style = { + "ok": "[green]OK[/green]", + "warn": "[yellow]WARN[/yellow]", + "fail": "[red]FAIL[/red]", + } + for r in results: + console.print(f" {sev_style[r.severity]:<22} {r.name:<8} {r.message}") + if r.hint: + console.print(f" {'':<22} {'':<8} [dim]{r.hint}[/dim]") + + render_host_facts(console) + + worst = aggregate(results) + console.print() + if worst == "ok": + console.print("[green]All checks passed.[/green]") + elif worst == "warn": + console.print("[yellow]Checks passed with warnings.[/yellow]") + else: + console.print( + "[red]Critical checks failed — fix the issues above before 'lucebox start'.[/red]" + ) + return worst + + +def render_host_facts(console: Console) -> None: + """Print a pretty 'Host facts' section sourced from LUCEBOX_HOST_*. + + Same data that ends up in /opt/lucebox-hub/HOST_INFO inside the + container — printed here so the operator can sanity-check the + rig classification BEFORE starting a long bench run, and so the + CI exit-code gate (the pass/fail checks above) stays orthogonal + to the informational host facts. + + Reads from the same LUCEBOX_HOST_* env the host wrapper exports + (see lucebox.sh::probe_host). Quiet — emits the section header + even when most facts are unset, since "no host facts probed at + all" is itself a useful signal. 
+ """ + console.print() + console.print("[bold]Host facts[/bold] (LUCEBOX_HOST_*, surfaced as /props.host)") + facts = [ + ("os", os.environ.get("LUCEBOX_HOST_OS_PRETTY", "")), + ("kernel", os.environ.get("LUCEBOX_HOST_KERNEL", "")), + ("wsl_version", os.environ.get("LUCEBOX_HOST_WSL_VERSION", "")), + ("docker", os.environ.get("LUCEBOX_HOST_DOCKER_VERSION", "")), + ("nvidia_driver", os.environ.get("LUCEBOX_HOST_DRIVER_VERSION", "")), + ("nvidia_ctk", os.environ.get("LUCEBOX_HOST_NVIDIA_CTK_VERSION", "")), + ("cpu", os.environ.get("LUCEBOX_HOST_CPU_MODEL", "")), + ("cuda_visible_devices", os.environ.get("LUCEBOX_HOST_CUDA_VISIBLE_DEVICES", "")), + ] + for key, value in facts: + display = value if value else "[dim](unset)[/dim]" + console.print(f" {key:<22} {display}") + + # Multi-GPU table — one line per device. LUCEBOX_HOST_GPU_LIST_CSV + # carries the verbatim nvidia-smi CSV the host wrapper probed. + csv = os.environ.get("LUCEBOX_HOST_GPU_LIST_CSV", "") + if csv: + console.print(" gpus:") + for line in csv.splitlines(): + line = line.strip() + if not line: + continue + parts = [c.strip() for c in line.split(",")] + if len(parts) >= 7: + idx, _uuid, _pci, name, sm, mem, plimit = parts[:7] + console.print( + f" [{idx}] {name} (sm_{sm}, {mem}, {plimit})" + ) + else: + console.print(f" {line}") + else: + console.print(" gpus [dim](none — nvidia-smi unavailable)[/dim]") diff --git a/lucebox/src/lucebox/host_facts.py b/lucebox/src/lucebox/host_facts.py new file mode 100644 index 00000000..5deb6721 --- /dev/null +++ b/lucebox/src/lucebox/host_facts.py @@ -0,0 +1,58 @@ +"""Read HostFacts from the LUCEBOX_HOST_* env vars that lucebox.sh exports. + +We deliberately don't try to detect anything ourselves on the Python side — +inside the container, /proc/meminfo reports the container's view, not the +host's, and nvidia-smi may or may not be available depending on how the +caller invoked us. 
The host wrapper is the only thing that can see the +truth, and it's already paid for the probe. +""" + +from __future__ import annotations + +import os +from typing import cast + +from lucebox.types import CtkStatus, GpuVendor, HostFacts + + +def _env_int(key: str, default: int = 0) -> int: + raw = os.environ.get(key, "").strip() + if not raw: + return default + try: + return int(raw) + except ValueError: + return default + + +def _env_bool(key: str) -> bool: + return os.environ.get(key, "").strip() in {"1", "true", "yes", "on"} + + +def from_env() -> HostFacts: + vendor: GpuVendor = "none" + raw_vendor = os.environ.get("LUCEBOX_HOST_GPU_VENDOR", "none") + if raw_vendor in {"nvidia", "amd", "none"}: + vendor = cast(GpuVendor, raw_vendor) + + ctk: CtkStatus = "none" + raw_ctk = os.environ.get("LUCEBOX_HOST_HAS_CTK", "none") + if raw_ctk in {"runtime", "cdi", "installed-unwired", "none"}: + ctk = cast(CtkStatus, raw_ctk) + + return HostFacts( + nproc=_env_int("LUCEBOX_HOST_NPROC"), + ram_gb=_env_int("LUCEBOX_HOST_RAM_GB"), + gpu_vendor=vendor, + gpu_name=os.environ.get("LUCEBOX_HOST_GPU_NAME", ""), + gpu_count=_env_int("LUCEBOX_HOST_GPU_COUNT"), + vram_gb=_env_int("LUCEBOX_HOST_VRAM_GB"), + gpu_sm=os.environ.get("LUCEBOX_HOST_GPU_SM", ""), + driver_version=os.environ.get("LUCEBOX_HOST_DRIVER_VERSION", ""), + driver_major=_env_int("LUCEBOX_HOST_DRIVER_MAJOR"), + has_systemd=_env_bool("LUCEBOX_HOST_HAS_SYSTEMD"), + is_wsl=_env_bool("LUCEBOX_HOST_IS_WSL"), + has_docker=_env_bool("LUCEBOX_HOST_HAS_DOCKER"), + docker_version=os.environ.get("LUCEBOX_HOST_DOCKER_VERSION", ""), + ctk=ctk, + ) diff --git a/lucebox/src/lucebox/profile.py b/lucebox/src/lucebox/profile.py new file mode 100644 index 00000000..b3ff38a2 --- /dev/null +++ b/lucebox/src/lucebox/profile.py @@ -0,0 +1,203 @@ +"""``lucebox profile`` — thin wrapper around ``luce-bench snapshot``. + +The previous incarnation owned its own step registry / audit pipeline / +fingerprint cache. 
That's all moved into ``lucebench.snapshot``; the +host-side wrapper now just: + + 1. probes host facts (LUCEBOX_HOST_* env vars are passed into the container), + 2. picks an output dir under ``$XDG_DATA_HOME/lucebox/profile-snapshots/``, + 3. exec's ``docker exec <container> luce-bench snapshot --level N + --url <url> --host-info <host-info.json> --out-dir <out-dir>``, + 4. streams the subprocess output to the operator's terminal. + +If no container is running we bail with a clear hint instead of trying +to bootstrap one — keeping ``profile`` predictable in CI and dry-run +scripts. +""" + +from __future__ import annotations + +import json +import os +import shlex +import subprocess +import urllib.error +import urllib.request +from dataclasses import asdict +from pathlib import Path +from typing import Any + +from rich.console import Console + +from lucebox.types import Config + +# ── helpers reused by other modules (cli.py imports these) ───────────────── + + +def _server_base_urls(cfg: Config, base_url: str | None = None) -> list[str]: + """Candidate URLs for the running lucebox server, host's-eye view. + + Tried in order: explicit override → 127.0.0.1 → host.docker.internal → + the default docker bridge gateway. Keeps the discovery logic in one + place so the CLI client launchers and the profile wrapper agree on + where the server lives. + """ + if base_url: + return [base_url.rstrip("/")] + return [ + f"http://127.0.0.1:{cfg.port}", + f"http://host.docker.internal:{cfg.port}", + f"http://172.17.0.1:{cfg.port}", + ] + + +def _json_get(url: str, timeout_s: float = 5.0) -> dict[str, Any]: + """GET ``url`` as JSON.
Returns {} on any transport / decode error.""" + try: + with urllib.request.urlopen(url, timeout=timeout_s) as resp: + return json.loads(resp.read()) + except (OSError, urllib.error.URLError, json.JSONDecodeError): + return {} + + +# ── host fact dump → JSON the bench can ingest ───────────────────────────── + + +def _host_info_payload(cfg: Config) -> dict[str, Any]: + """Convert our HostFacts (env-driven) into the bench's host-info shape. + + The bench expects the canonical keys returned by + ``lucebench.hostinfo.probe_host_info`` — cpu_model, nproc, ram_gb, + gpu_name, gpu_count, vram_gb, gpu_sm, gpu_power_limit_w, + driver_version, cuda_runtime_version, nvidia_smi_csv. We don't have + a power limit or CUDA runtime version on the host side, so those + fields stay None. + """ + host = cfg.host + # lucebox.sh::probe_host exports LUCEBOX_HOST_CPU_MODEL from + # /proc/cpuinfo; surface it here so the snapshot's host-info block + # carries the operator's real CPU rather than a None placeholder. + # The wrapper script may have run in a context that didn't probe + # (e.g. CI with the var pre-cleared), in which case we fall back to + # None — the bench tolerates missing fields. + cpu_model = os.environ.get("LUCEBOX_HOST_CPU_MODEL", "").strip() or None + return { + "cpu_model": cpu_model, + "nproc": host.nproc or None, + "ram_gb": host.ram_gb or None, + "gpu_name": host.gpu_name or None, + "gpu_count": host.gpu_count or None, + "vram_gb": host.vram_gb or None, + "gpu_sm": host.gpu_sm or None, + "gpu_power_limit_w": None, + "driver_version": host.driver_version or None, + "cuda_runtime_version": None, + "nvidia_smi_csv": None, + # Keep the rest of HostFacts under a vendor field so we don't + # lose the lucebox-specific bits when the snapshot is replayed. 
+ "lucebox_host_facts": asdict(host), + } + + +def _profile_out_dir() -> Path: + """Resolve the profile-snapshots root: $XDG_DATA_HOME/lucebox/profile-snapshots.""" + base = os.environ.get("XDG_DATA_HOME") or str(Path.home() / ".local" / "share") + return Path(base) / "lucebox" / "profile-snapshots" + + +def _container_running(name: str) -> bool: + """True iff ``docker inspect`` reports the named container as Running.""" + try: + out = subprocess.check_output( + ["docker", "inspect", "-f", "{{.State.Running}}", name], + text=True, + stderr=subprocess.DEVNULL, + ) + except (OSError, subprocess.CalledProcessError): + return False + return out.strip() == "true" + + +def _detect_url(cfg: Config, override: str | None) -> str: + """Pick the first /health-answering base URL, or the first candidate.""" + if override: + return override.rstrip("/") + for url in _server_base_urls(cfg): + if _json_get(url + "/health", timeout_s=1.0): + return url + # Caller will surface a clearer error from docker exec when the + # server is genuinely down — we just default to localhost. + return f"http://127.0.0.1:{cfg.port}" + + +def run_profile( + cfg: Config, + *, + level: str, + url: str | None = None, + console: Console | None = None, + out_dir: Path | None = None, + name: str | None = None, + label: str | None = None, +) -> int: + """Drive ``docker exec luce-bench snapshot ...`` end-to-end. + + Returns the exit code from the subprocess. ``console`` defaults to + a fresh ``Console`` when omitted — passing one in lets the host CLI + keep its themed output stream. + + ``out_dir``, ``name``, and ``label`` are forwarded to ``luce-bench + snapshot`` when set — used by ``lucebox autotune --sweep`` to pin + every cell's output under a single sweep dir (``profile-snapshots/ + sweep/cell-N/``) so the corpus stays queryable later by + ``luce-bench report``. 
+ """ + console = console or Console() + if level not in ("level0", "level1", "level2", "level3"): + console.print(f"[red]Unknown profile level: {level!r} (expected level0..level3)[/red]") + return 2 + + if not _container_running(cfg.container_name): + console.print( + f"[red]No running container named {cfg.container_name!r}.[/red]\n" + "[dim]Hint: run `lucebox start` (or `lucebox serve`) first.[/dim]" + ) + return 2 + + resolved_out = out_dir if out_dir is not None else _profile_out_dir() + resolved_out.mkdir(parents=True, exist_ok=True) + + base_url = _detect_url(cfg, url) + # Hand the bench a static host-info JSON so it doesn't have to + # probe inside the container (where /proc and nvidia-smi point at + # the container's namespace, not the host's). + host_info_path = resolved_out / "_host-info.json" + host_info_path.write_text(json.dumps(_host_info_payload(cfg), indent=2) + "\n") + + cmd = [ + "docker", + "exec", + cfg.container_name, + "luce-bench", + "snapshot", + "--level", + level, + "--url", + base_url, + "--host-info", + str(host_info_path), + "--out-dir", + str(resolved_out), + ] + if name is not None: + cmd += ["--name", name] + if label is not None: + cmd += ["--label", label] + console.print(f"[bold]running:[/bold] {' '.join(shlex.quote(a) for a in cmd)}") + proc = subprocess.run(cmd, check=False) + return proc.returncode + + +# Re-exports so legacy callers (lucebox.cli._detect_server_url, tests) +# still import ``_server_base_urls`` / ``_json_get`` from here. +__all__ = ["_json_get", "_server_base_urls", "run_profile"] diff --git a/lucebox/src/lucebox/py.typed b/lucebox/src/lucebox/py.typed new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/lucebox/src/lucebox/py.typed @@ -0,0 +1 @@ + diff --git a/lucebox/src/lucebox/smoke.py b/lucebox/src/lucebox/smoke.py new file mode 100644 index 00000000..e77e1cd0 --- /dev/null +++ b/lucebox/src/lucebox/smoke.py @@ -0,0 +1,247 @@ +"""Liveness smoke test. 
+ +Checks that the running server reports a healthy `/props` shape, streams text, +and can emit an OpenAI-format tool call. Semantic quality is the benchmark's +job; smoke proves the API surface is wired to the intended lucebox server. + +Talks to the server via the host docker socket — the server container's port +is mapped to host port `cfg.port`, and the orchestrator container reaches it +via `host.docker.internal` on Docker Desktop or via the docker bridge +gateway on Linux. We resolve to `host.docker.internal` first and fall back. +""" + +from __future__ import annotations + +import json +import socket +import time +from dataclasses import dataclass + +import httpx + +from lucebox.types import Config + +DEFAULT_TIMEOUT_S = 60.0 +DEFAULT_PROMPT = "Reply with exactly one word: hello" + + +@dataclass(frozen=True, slots=True) +class SmokeResult: + ok: bool + http_status: int + n_tokens: int + wall_s: float + props_ok: bool = False + tool_ok: bool = False + error: str = "" + + +def _server_base_url(cfg: Config) -> str: + """Where to reach the server from inside the orchestrator container. + + Tries `host.docker.internal` (Docker Desktop, also added by recent + docker-ce via --add-host), then falls back to the default bridge + gateway 172.17.0.1. 
+ """ + host = "host.docker.internal" + try: + socket.gethostbyname(host) + except OSError: + host = "172.17.0.1" + return f"http://{host}:{cfg.port}" + + +def _check_props(client: httpx.Client, base_url: str) -> tuple[bool, str]: + try: + resp = client.get(base_url + "/props") + except httpx.HTTPError as e: + return False, f"/props failed: {e}" + if resp.status_code != 200: + return False, f"/props HTTP {resp.status_code}" + try: + props = resp.json() + except ValueError as e: + return False, f"/props invalid JSON: {e}" + required_top = ( + "default_generation_settings", + "model_alias", + "model_path", + "build_info", + "speculative_mode", + ) + missing = [k for k in required_top if k not in props] + if missing: + return False, f"/props missing {', '.join(missing)}" + dgs = props.get("default_generation_settings") + if not isinstance(dgs, dict) or not all( + k in dgs for k in ("n_ctx", "temperature", "top_p", "top_k", "min_p") + ): + return False, "/props default_generation_settings incomplete" + runtime = props.get("runtime") + if not isinstance(runtime, dict) or not runtime.get("backend"): + return False, "/props runtime.backend missing" + if props.get("speculative_mode") not in {"off", "mtp", "dflash", "pflash"}: + return False, "/props speculative_mode invalid" + return True, "" + + +def _check_tool_call(client: httpx.Client, base_url: str, timeout_s: float) -> tuple[bool, str]: + body = { + "model": "luce-dflash", + "messages": [ + { + "role": "user", + "content": ( + "Use the provided tool now. Call report_status with " + 'status="ok". Do not answer in prose.' 
+ ), + } + ], + "tools": [ + { + "type": "function", + "function": { + "name": "report_status", + "description": "Report smoke-test status.", + "parameters": { + "type": "object", + "properties": {"status": {"type": "string"}}, + "required": ["status"], + }, + }, + } + ], + "tool_choice": {"type": "function", "function": {"name": "report_status"}}, + "temperature": 0, + "max_tokens": 128, + "stream": False, + "chat_template_kwargs": {"enable_thinking": False}, + } + last_err = "" + for attempt in range(1, 4): + try: + resp = client.post(base_url + "/v1/chat/completions", json=body, timeout=timeout_s) + except httpx.HTTPError as e: + last_err = f"tool call request failed: {e}" + continue + if resp.status_code != 200: + last_err = f"tool call HTTP {resp.status_code}: {resp.text[:300]}" + continue + try: + data = resp.json() + except ValueError as e: + last_err = f"tool call invalid JSON: {e}" + continue + choices = data.get("choices") or [] + if not choices: + last_err = "tool call response had no choices" + continue + msg = choices[0].get("message") or {} + calls = msg.get("tool_calls") or [] + if not calls: + finish = choices[0].get("finish_reason") + content = (msg.get("content") or "")[:300] + last_err = ( + f"attempt {attempt}: no tool_calls emitted (finish={finish}, content={content!r})" + ) + continue + names = [((c.get("function") or {}).get("name")) for c in calls] + if "report_status" in names: + return True, "" + last_err = f"attempt {attempt}: wrong tool call names: {names!r}" + return False, last_err + + +def run( + cfg: Config, + *, + prompt: str = DEFAULT_PROMPT, + timeout_s: float = DEFAULT_TIMEOUT_S, + check_tools: bool = True, +) -> SmokeResult: + base_url = _server_base_url(cfg) + url = base_url + "/v1/chat/completions" + body = { + "model": "luce-dflash", + "messages": [{"role": "user", "content": prompt}], + "max_tokens": 16, + "stream": True, + } + + t0 = time.perf_counter() + n_tokens = 0 + try: + with httpx.Client(timeout=timeout_s) as client: 
+ props_ok, props_err = _check_props(client, base_url) + if not props_ok: + return SmokeResult( + ok=False, + http_status=0, + n_tokens=0, + wall_s=time.perf_counter() - t0, + props_ok=False, + tool_ok=False, + error=props_err, + ) + + with client.stream("POST", url, json=body) as resp: + status = resp.status_code + if status != 200: + return SmokeResult( + ok=False, + http_status=status, + n_tokens=0, + wall_s=time.perf_counter() - t0, + props_ok=props_ok, + tool_ok=False, + error=f"HTTP {status}", + ) + for line in resp.iter_lines(): + if not line.startswith("data:"): + continue + payload = line[5:].strip() + if payload == "[DONE]": + break + try: + chunk = json.loads(payload) + except json.JSONDecodeError: + continue + choices = chunk.get("choices") or [] + if not choices: + continue + delta = choices[0].get("delta") or {} + if delta.get("content"): + n_tokens += 1 + if n_tokens < 1: + return SmokeResult( + ok=False, + http_status=status, + n_tokens=n_tokens, + wall_s=time.perf_counter() - t0, + props_ok=props_ok, + tool_ok=False, + error="no tokens streamed", + ) + + tool_ok = True + tool_err = "" + if check_tools: + tool_ok, tool_err = _check_tool_call(client, base_url, timeout_s) + wall = time.perf_counter() - t0 + return SmokeResult( + ok=(status == 200 and n_tokens >= 1 and props_ok and tool_ok), + http_status=status, + n_tokens=n_tokens, + wall_s=wall, + props_ok=props_ok, + tool_ok=tool_ok, + error=tool_err, + ) + except httpx.HTTPError as e: + return SmokeResult( + ok=False, + http_status=0, + n_tokens=n_tokens, + wall_s=time.perf_counter() - t0, + error=str(e), + ) diff --git a/lucebox/src/lucebox/sweep.py b/lucebox/src/lucebox/sweep.py new file mode 100644 index 00000000..f7655ab0 --- /dev/null +++ b/lucebox/src/lucebox/sweep.py @@ -0,0 +1,868 @@ +"""``lucebox autotune --sweep`` — empirical DFLASH_* bracket on the live server. 
+ +Reuses existing primitives rather than re-inventing per-cell server +spawning: + + * ``autotune.candidate_configs(host)`` builds the per-tier bracket. + * ``config.config_set("dflash.*")`` writes each candidate to + ``~/.lucebox/config.toml`` — the same sparse-write path + ``autotune --apply`` uses. + * ``subprocess.run(["systemctl", "--user", "restart", "lucebox.service"])`` + cycles the server. We shell out instead of re-implementing the + restart so the systemd unit's lifecycle stays the single source of + truth. + * Poll ``http://localhost:<port>/v1/models`` until 200 OK, then + * shell out to ``lucebox profile --level level1`` which runs + ``luce-bench snapshot`` in the container. Parse decode_tokens_per_sec + out of the resulting ``<snapshot_dir>/<area>.json`` rows. + +Pre-sweep state is snapshotted (``~/.lucebox/config.toml`` → ``.sweep- +backup``) and restored on SIGINT/SIGTERM or any uncaught exception. +On a successful sweep the backup is deleted after the winner is +applied; on failure (no cell produced a tps reading, or all cells +timed out) the backup is restored and the exit code is non-zero. +""" + +from __future__ import annotations + +import dataclasses +import json +import os +import shutil +import signal +import subprocess +import time +import urllib.error +import urllib.request +from pathlib import Path + +from rich.console import Console +from rich.table import Table + +from lucebox import autotune as autotune_mod +from lucebox import config as config_mod +from lucebox.host_facts import from_env +from lucebox.types import DflashRuntime + +# ── allowlist: dflash.* fields written by the sweep per cell ──────────────── +# Kept in sync with cli.DFLASH_ALLOWLIST — we duplicate it locally to +# avoid an import cycle (cli.py imports this module). ``fa_window`` is +# included even though it's not part of the strict lucebench snapshot +# allowlist; the sweep needs to be able to vary it as a per-cell +# bracket axis for the coding-agent-loop profile.
+DFLASH_ALLOWLIST: tuple[str, ...] = ( + "budget", + "max_ctx", + "lazy", + "prefix_cache_slots", + "prefill_cache_slots", + "cache_type_k", + "cache_type_v", + "prefill_mode", + "prefill_keep_ratio", + "prefill_threshold", + "prefill_drafter", + "fa_window", +) + + +@dataclasses.dataclass(slots=True) +class CellResult: + """One sweep cell's outcome. + + Carries either the legacy ``mean_decode_tps`` (heuristic profile) or + the composite ``(passed, speed_metric, pass_reason)`` set used by + the coding-agent-loop profile. The fields are not mutually + exclusive — a cell can carry both when both scorers ran — but the + sweep driver only populates one per profile run. + """ + + index: int + config: DflashRuntime + snapshot_dir: Path | None + mean_decode_tps: float | None + error: str | None + # Coding-agent-loop scorer outputs. ``passed`` is None when the + # profile didn't run; True/False after a pass/fail check. + passed: bool | None = None + pass_reason: str = "" + # Higher = better. For agent_replay_pass_rate this is + # completion_tokens / wall_seconds (rough throughput when the + # snapshot's ``decode_tokens_per_sec`` is empty, as it is for + # longctx cells on gemma). + speed_metric: float | None = None + # Echoed case length so the results table can show "which trace was + # actually exercised" — useful when the bracket varied max_ctx. + case_id: str = "" + case_tokens: int | None = None + + +# ── snapshot dir + decode-tps extraction ─────────────────────────────────── + + +def _sweep_root() -> Path: + """Top-level sweep dir: ``$XDG_DATA_HOME/lucebox/profile-snapshots/sweep``. + + Cells land at ``/cell-NN-/``; the per-cell + snapshot dir name is then chosen by ``luce-bench snapshot`` via the + ``--name`` flag we pass. 
+ """ + base = os.environ.get("XDG_DATA_HOME") or str(Path.home() / ".local" / "share") + return Path(base) / "lucebox" / "profile-snapshots" / "sweep" + + +def _short_hash(config: DflashRuntime) -> str: + """Stable 8-char tag for a config — used in cell directory names. + + Hash the eleven allowlisted fields so two runs with the same + bracket produce the same dir names (helps `luce-bench report` + dedupe across host machines and reruns). + """ + import hashlib + + fields = "|".join(f"{k}={getattr(config, k)!r}" for k in DFLASH_ALLOWLIST) + return hashlib.sha1(fields.encode("utf-8"), usedforsecurity=False).hexdigest()[:8] + + +def _mean_decode_tps_from_snapshot(snapshot_dir: Path) -> float | None: + """Walk every per-area JSON in ``snapshot_dir`` and average decode tps. + + luce-bench snapshot writes ``.json`` per area (smoke, code, + gsm8k, agent, longctx for level1). Each row carries + ``timings.decode_tokens_per_sec`` when the lucebox-server populated + it. We average across all rows where that field is present — the + winner picker is "highest mean" so a higher row count just makes + the estimate more stable. + + Returns None when no row in the snapshot carries a decode tps + (offline server, OpenRouter-shaped responses with no timings, + etc.). Callers treat None as "this cell didn't produce a measurement" + and exclude it from winner picking. 
+ """ + if not snapshot_dir.exists(): + return None + tps_values: list[float] = [] + for area_json in sorted(snapshot_dir.glob("*.json")): + if area_json.name.startswith("_") or area_json.name in { + "host.json", + "props.json", + "config.json", + }: + continue + try: + payload = json.loads(area_json.read_text()) + except (OSError, json.JSONDecodeError): + continue + if not isinstance(payload, dict): + continue + rows = payload.get("rows") + if not isinstance(rows, list): + continue + for row in rows: + if not isinstance(row, dict): + continue + timings = row.get("timings") + if not isinstance(timings, dict): + continue + tps = timings.get("decode_tokens_per_sec") + if isinstance(tps, int | float) and tps > 0: + tps_values.append(float(tps)) + if not tps_values: + return None + return sum(tps_values) / len(tps_values) + + +# ── server lifecycle: restart + readiness probe ──────────────────────────── + + +def _systemctl_restart() -> int: + """``systemctl --user restart lucebox.service``. Returns exit code. + + Shell out instead of adding a Python restart() helper — the + systemd unit is already the single source of truth for the + server's lifecycle (see ``lucebox.sh::cmd_systemctl_passthrough``). 
# ── server lifecycle: restart + readiness probe ────────────────────────────


def _systemctl_restart() -> int:
    """``systemctl --user restart lucebox.service``. Returns exit code.

    Shell out instead of adding a Python restart() helper — the
    systemd unit is already the single source of truth for the
    server's lifecycle (see ``lucebox.sh::cmd_systemctl_passthrough``).

    Returns 127 (the shell's "command not found" code) when the
    ``systemctl`` binary itself is missing, so callers that only check
    for a non-zero exit code see an ordinary restart failure instead of
    an unhandled ``FileNotFoundError``.
    """
    try:
        completed = subprocess.run(
            ["systemctl", "--user", "restart", "lucebox.service"],
            check=False,
        )
    except FileNotFoundError:
        return 127
    return completed.returncode


def _wait_ready(port: int, timeout_s: int) -> bool:
    """Poll ``http://localhost:<port>/v1/models`` until 200 OK or budget runs out.

    Returns True on the first 200 response and False once ``timeout_s``
    seconds have elapsed. A zero or negative budget returns False without
    probing. Connection errors, non-200 statuses and malformed HTTP
    replies all mean "not ready yet" and are retried once per second.
    """
    import http.client

    # Monotonic clock: an NTP step or manual clock change during the wait
    # must neither cut the budget short nor extend it.
    deadline = time.monotonic() + timeout_s
    probe_url = f"http://localhost:{port}/v1/models"
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(probe_url, timeout=2) as resp:
                if resp.status == 200:
                    return True
        except (urllib.error.URLError, http.client.HTTPException, OSError):
            # Server still starting (refused / reset / half-written reply).
            pass
        time.sleep(1.0)
    return False


# ── pre-flight checks ──────────────────────────────────────────────────────


def _systemd_unit_path() -> Path:
    """Where the user-installed systemd unit lives.

    ``$XDG_CONFIG_HOME/systemd/user/lucebox.service``, falling back to
    ``~/.config`` when the variable is unset or empty.
    """
    base = os.environ.get("XDG_CONFIG_HOME") or str(Path.home() / ".config")
    return Path(base) / "systemd" / "user" / "lucebox.service"


def _preflight(console: Console) -> int | None:
    """Refuse to sweep when prerequisites are missing.

    Returns an exit code (non-zero) when the sweep should abort, or
    None when it's safe to proceed. Each refusal includes a one-line
    hint pointing at the canonical "fix it" command.
    """
    unit_path = _systemd_unit_path()
    if not unit_path.exists():
        console.print(
            f"[red]No lucebox.service unit at {unit_path}.[/red]\n"
            "[dim]Hint: run `lucebox install` first.[/dim]"
        )
        return 2

    # No preset AND no target_file → entrypoint would have nothing to
    # serve. Either is enough on its own (preset implies a target;
    # explicit target_file overrides the preset path).
    cfg = config_mod.load() or config_mod.live_config()
    if not cfg.model.preset and not cfg.model.target_file:
        console.print(
            "[red]No model configured (model.preset and model.target_file are both unset).[/red]\n"
            "[dim]Hint: run `lucebox models download` first.[/dim]"
        )
        return 2

    return None


# ── backup + restore (signal-safe) ─────────────────────────────────────────


def _backup_path() -> Path:
    """Backup location: the config path with ``.toml.sweep-backup`` as its suffix."""
    return config_mod.default_config_path().with_suffix(".toml.sweep-backup")


def _take_backup(console: Console) -> Path | None:
    """Copy config.toml → config.toml.sweep-backup. Returns the backup path.

    None when there's no pre-existing config.toml — the sweep will
    still trap signals, and on failure the on-disk file is simply
    removed (back to the pre-sweep state of "no file").
    """
    cfg_path = config_mod.default_config_path()
    if not cfg_path.exists():
        return None
    backup = _backup_path()
    backup.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(cfg_path, backup)
    console.print(f"[dim]config.toml → {backup}[/dim]")
    return backup


def _restore_backup(console: Console, backup: Path | None) -> None:
    """Put config.toml back the way we found it.

    With a backup on disk the file is copied back over config.toml (the
    backup itself is left in place). Without one, a config.toml that now
    exists must have been created by the sweep and is removed.
    """
    cfg_path = config_mod.default_config_path()
    if backup is not None and backup.exists():
        shutil.copy2(backup, cfg_path)
        console.print(f"[yellow]Restored[/yellow] config.toml from {backup}")
    elif cfg_path.exists():
        # Pre-sweep state was "no file" — remove ours.
        cfg_path.unlink()
        console.print("[yellow]Removed[/yellow] config.toml (no pre-sweep file)")


# ── config write + apply ───────────────────────────────────────────────────


def _apply_config(runtime: DflashRuntime) -> None:
    """Write the swept dflash.* fields of ``runtime`` to config.toml.

    Uses ``config_set`` so each field lands sparsely — other on-disk
    keys (model.preset etc.) are untouched. Only the fields named in
    ``DFLASH_ALLOWLIST`` are written.
    """
    for name in DFLASH_ALLOWLIST:
        config_mod.config_set(f"dflash.{name}", getattr(runtime, name))
# ── coding-agent-loop scorer (alternative to decode-tps snapshot) ──────────


def _score_agent_replay(
    port: int,
    max_ctx: int,
    *,
    hard_limit_reply_budget: int = 4096,
    request_timeout_s: int = 300,
) -> tuple[bool, str, float | None, str, int | None]:
    """Run the largest fitting multi-turn case against the live server.

    Returns ``(passed, reason, speed_metric, case_id, case_tokens)``:

    * ``passed`` — True iff the server returned a non-empty,
      non-erroring response within ``request_timeout_s``.
    * ``reason`` — human-readable verdict (carried into the results table).
    * ``speed_metric`` — ``completion_tokens / wall_seconds`` when the
      response carried a usage block; ``None`` when not. Higher = better.
    * ``case_id`` / ``case_tokens`` — which fixture case actually ran
      (``""`` / ``None`` when no case was selected).

    The fixture is loaded lazily so a sweep cell that won't reach this
    scorer (e.g. the heuristic profile) pays no import cost. When the
    fixture is missing or has no case fitting under the cell's
    ``max_ctx − hard_limit_reply_budget`` budget, the call returns a
    fail with a descriptive reason — the cell is then excluded from
    winner selection.

    Transport errors, HTTP error statuses, truncated replies and bodies
    that are not a JSON object with the expected shape are all reported
    as a failed cell; none of them raises.
    """
    import http.client

    try:
        # luce-bench lands in a sibling PR (#337); the workspace deliberately
        # doesn't hard-depend on it, so mypy can't resolve the symbol at
        # check time. The try/except below is the runtime fallback.
        from lucebench.areas.agent_recorded import (  # type: ignore[import-not-found]
            load_agent_recorded_multi_turn_cases,
            pick_multi_turn_case_for_budget,
        )
    except ImportError as exc:
        return (False, f"luce-bench not importable: {exc}", None, "", None)

    cases = load_agent_recorded_multi_turn_cases()
    if not cases:
        return (False, "no multi-turn cases on disk", None, "", None)

    prompt_budget = max(1, max_ctx - hard_limit_reply_budget)
    case = pick_multi_turn_case_for_budget(cases, prompt_budget)
    if case is None:
        return (
            False,
            f"no case fits under prompt_budget={prompt_budget}",
            None,
            "",
            None,
        )

    case_id = case["id"]
    case_tokens = int(case["context_tokens_approx"])

    def _fail(reason: str) -> tuple[bool, str, float | None, str, int | None]:
        # Every failure after case selection reports which case was attempted.
        return (False, reason, None, case_id, case_tokens)

    # Cap completion to a fraction of the reply budget so we don't
    # gobble wall time on hopeless decodes — the verifier just needs
    # to confirm the server is producing tokens, not full sessions.
    body = {
        "model": "dflash",
        "messages": case["messages"],
        "max_tokens": min(hard_limit_reply_budget, 256),
        "temperature": 0.0,
        "stream": False,
    }
    req = urllib.request.Request(
        f"http://localhost:{port}/v1/chat/completions",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    t0 = time.perf_counter()
    try:
        with urllib.request.urlopen(req, timeout=request_timeout_s) as resp:
            raw = json.loads(resp.read())
    except urllib.error.HTTPError as exc:
        wall = time.perf_counter() - t0
        return _fail(f"HTTP {exc.code} after {wall:.1f}s")
    except (urllib.error.URLError, http.client.HTTPException, OSError, TimeoutError) as exc:
        wall = time.perf_counter() - t0
        return _fail(f"{type(exc).__name__} after {wall:.1f}s: {exc}")
    except ValueError as exc:
        # JSONDecodeError / UnicodeDecodeError: the server answered 200 with
        # a body that is not JSON. That is a failed cell, not a crashed sweep.
        wall = time.perf_counter() - t0
        return _fail(f"invalid JSON after {wall:.1f}s: {exc}")
    wall = time.perf_counter() - t0

    # Defensive unpacking: any layer may be missing, null, or the wrong type.
    choices = raw.get("choices") if isinstance(raw, dict) else None
    choice = choices[0] if isinstance(choices, list) and choices else {}
    msg = choice.get("message") if isinstance(choice, dict) else None
    if not isinstance(msg, dict):
        msg = {}
    content = "".join(
        part
        for part in (msg.get("content"), msg.get("reasoning_content"))
        if isinstance(part, str)
    )
    if not content.strip():
        return _fail(f"empty response after {wall:.1f}s")

    usage = raw.get("usage") if isinstance(raw, dict) else None
    completion_tokens = usage.get("completion_tokens") if isinstance(usage, dict) else None
    speed: float | None = None
    if (
        isinstance(completion_tokens, int | float)
        and not isinstance(completion_tokens, bool)
        and completion_tokens > 0
        and wall > 0
    ):
        speed = float(completion_tokens) / wall

    reason = (
        f"pass: {len(content)} chars / {completion_tokens or '?'} tok in "
        f"{wall:.1f}s (case {case_id})"
    )
    return (True, reason, speed, case_id, case_tokens)


# ── winner selection + results table ───────────────────────────────────────


def _pick_winner(results: list[CellResult], scorer: str) -> CellResult | None:
    """Profile-aware winner selection.

    ``scorer == "decode_tps_snapshot"`` (and any other value): highest
    mean_decode_tps wins; ties → lower max_ctx, then lower budget. Cells
    with ``mean_decode_tps is None`` are excluded.

    ``scorer == "agent_replay_pass_rate"``: only passing cells qualify;
    among those, largest ``max_ctx`` wins first (a coding-agent-loop
    workload must not silently downgrade the context window — a faster
    result at a smaller ctx is not a better result). Within the same
    max_ctx, highest ``speed_metric`` breaks the tie, then larger
    fa_window, then lower budget.

    Why max_ctx before speed: cells at different max_ctx values run
    against different fixture cases (32K case for 65K ctx, 64K case for
    96K ctx). The 32K case prefills and decodes faster, so a 65K cell
    always shows higher tok/s than a 96K cell — not because the config
    is better, but because the test input is shorter. Sorting by
    speed first would systematically pick the smaller context as the
    "winner". See bragi sweep 2026-05-30 for a concrete example.

    Returns None when no cell qualifies. When several cells tie on every
    key, the earliest one in ``results`` wins.
    """
    if scorer == "agent_replay_pass_rate":
        passing = [r for r in results if r.passed is True]
        if not passing:
            return None
        return min(
            passing,
            key=lambda r: (
                -int(r.config.max_ctx),
                -float(r.speed_metric or 0),
                -int(r.config.fa_window),
                int(r.config.budget),
            ),
        )

    # Default / heuristic path.
    measured = [r for r in results if r.mean_decode_tps is not None]
    if not measured:
        return None
    return min(
        measured,
        key=lambda r: (
            -float(r.mean_decode_tps or 0),
            int(r.config.max_ctx),
            int(r.config.budget),
        ),
    )


def _print_results(
    console: Console,
    results: list[CellResult],
    winner: CellResult | None,
    scorer: str = "decode_tps_snapshot",
) -> None:
    """Pretty-print the final results table after the sweep completes.

    Columns vary by scorer:
      * decode_tps_snapshot: # / budget / max_ctx / kv / tps / status
      * agent_replay_pass_rate: # / budget / max_ctx / fa_win / kv /
        pflash / case tok / tok/s / pass / status

    ``winner`` is matched by identity, so it must be one of the objects
    in ``results`` for its row to be marked.
    """
    if scorer == "agent_replay_pass_rate":
        table = Table(title=f"Sweep complete (coding-agent-loop). {len(results)} cell(s).")
        table.add_column("#")
        table.add_column("budget", justify="right")
        table.add_column("max_ctx", justify="right")
        table.add_column("fa_win", justify="right")
        table.add_column("kv")
        table.add_column("pflash")
        table.add_column("case tok", justify="right")
        table.add_column("tok/s", justify="right")
        table.add_column("pass")
        table.add_column("status")
        for r in results:
            cfg = r.config
            kv = cfg.cache_type_k or "—"
            case_tok = "—" if r.case_tokens is None else str(r.case_tokens)
            speed = "—" if r.speed_metric is None else f"{r.speed_metric:.1f}"
            if r.error:
                pass_cell = "—"
                status = f"[red]{r.error}[/red]"
            elif r.passed is None:
                pass_cell = "—"
                status = "[yellow]not scored[/yellow]"
            elif r.passed:
                pass_cell = "[green]✓[/green]"
                status = "[green]← winner[/green]" if winner is r else r.pass_reason[:60]
            else:
                pass_cell = "[red]✗[/red]"
                status = r.pass_reason[:60]
            table.add_row(
                str(r.index + 1),
                str(cfg.budget),
                str(cfg.max_ctx),
                str(cfg.fa_window),
                kv,
                cfg.prefill_mode,
                case_tok,
                speed,
                pass_cell,
                status,
            )
        console.print(table)
        return

    table = Table(title=f"Sweep complete. Tested {len(results)} config(s).")
    table.add_column("#")
    table.add_column("budget", justify="right")
    table.add_column("max_ctx", justify="right")
    table.add_column("kv")
    table.add_column("tps", justify="right")
    table.add_column("status")
    for r in results:
        cfg = r.config
        kv = cfg.cache_type_k or "auto"
        if r.error:
            tps_str = "—"
            status = f"[red]{r.error}[/red]"
        elif r.mean_decode_tps is None:
            tps_str = "—"
            status = "[yellow]no tps in snapshot[/yellow]"
        else:
            tps_str = f"{r.mean_decode_tps:.1f}"
            status = "[green]← winner[/green]" if winner is r else ""
        table.add_row(str(r.index + 1), str(cfg.budget), str(cfg.max_ctx), kv, tps_str, status)
    console.print(table)
# ── driver ─────────────────────────────────────────────────────────────────


def _format_eta(n_candidates: int) -> str:
    """Rough wall-time estimate to surface before the user commits.

    Each cell does: restart (~10 s) + readiness wait (≤60 s) + level1
    snapshot (~30-60 s on a 24 GB rig). Call it ~90 s per cell as a
    user-facing estimate; honest enough to set expectations without
    over-promising.

    The returned text already carries its own ``~`` prefix
    (``"~3 min"`` / ``"~0s"``); callers must not add another.
    """
    seconds = n_candidates * 90
    minutes = seconds // 60
    if minutes < 1:
        return f"~{seconds}s"
    return f"~{minutes} min"


def _failed_cell(index: int, config: DflashRuntime, error: str) -> CellResult:
    """Build the result for a cell that never reached its scorer."""
    return CellResult(
        index=index,
        config=config,
        snapshot_dir=None,
        mean_decode_tps=None,
        error=error,
    )


def _resolve_snapshot_dir(
    sweep_root: Path,
    cell_name: str,
    cell_start: float,
    results: list[CellResult],
) -> Path:
    """Locate the snapshot dir the profile run actually wrote for a cell.

    We asked for ``<sweep_root>/<cell_name>``, but the snapshot subcommand
    may decorate the name with host/gpu/date when the requested name
    isn't honored (older luce-bench builds). In that case fall back to
    the newest directory under ``sweep_root`` that was modified at or
    after ``cell_start`` and is not already attributed to an earlier cell
    of this run. The mtime gate is the key bit — a previous sweep run's
    directory may still sit under ``sweep_root``, and without the gate
    we'd silently attribute its TPS to the current cell.

    Returns the expected path (which may not exist) when nothing matches.
    """
    expected = sweep_root / cell_name
    if expected.exists():
        return expected

    claimed = {r.snapshot_dir for r in results if r.snapshot_dir is not None}
    newest: Path | None = None
    newest_mtime = -1.0
    for child in sweep_root.iterdir():
        if not child.is_dir() or child in claimed:
            continue
        mtime = child.stat().st_mtime
        # Stale-dir guard: anything older than this cell's start is from
        # a previous run.
        if mtime < cell_start:
            continue
        if mtime > newest_mtime:
            newest_mtime = mtime
            newest = child
    return newest or expected


def _sweep_cell(
    *,
    console: Console,
    cfg,
    candidate: DflashRuntime,
    index: int,
    total: int,
    scorer: str,
    ready_timeout: int,
    sweep_root: Path,
    results: list[CellResult],
) -> CellResult:
    """Apply one candidate, restart the server, wait for it, and score it.

    Never raises for the expected per-cell failures (config write error,
    non-zero restart, readiness timeout); those come back as a
    ``CellResult`` with ``error`` set so the sweep continues with the
    next cell. ``results`` is only read, to avoid attributing the same
    snapshot directory to two cells.
    """
    short = _short_hash(candidate)
    kv = candidate.cache_type_k or "auto"
    extras = ""
    if scorer == "agent_replay_pass_rate":
        extras = f" fa_window={candidate.fa_window} pflash={candidate.prefill_mode}"
    console.print(
        f"[bold][{index + 1}/{total}][/bold] "
        f"budget={candidate.budget} max_ctx={candidate.max_ctx} kv={kv}{extras}"
    )

    try:
        _apply_config(candidate)
    except (KeyError, ValueError, OSError) as exc:
        return _failed_cell(index, candidate, f"config_set: {exc}")

    rc = _systemctl_restart()
    if rc != 0:
        return _failed_cell(index, candidate, f"restart exit={rc}")

    if not _wait_ready(cfg.port, ready_timeout):
        return _failed_cell(index, candidate, f"server-not-ready ({ready_timeout}s)")

    if scorer == "agent_replay_pass_rate":
        passed, reason, speed, case_id, case_tokens = _score_agent_replay(
            cfg.port,
            max_ctx=candidate.max_ctx,
        )
        speed_str = "—" if speed is None else f"{speed:.1f} tok/s"
        console.print(
            f"  → {'pass' if passed else 'fail'} ({reason[:80]}) "
            f"speed={speed_str}"
        )
        return CellResult(
            index=index,
            config=candidate,
            snapshot_dir=None,
            mean_decode_tps=None,
            error=None,
            passed=passed,
            pass_reason=reason,
            speed_metric=speed,
            case_id=case_id,
            case_tokens=case_tokens,
        )

    # Legacy heuristic path: call run_profile directly so we land
    # snapshots in <sweep_root>/<cell_name>/ instead of the default
    # profile-snapshots dir. Going through subprocess + env vars was the
    # old plumbing but `lucebox profile` doesn't read LUCEBOX_SWEEP_* —
    # the per-cell routing only works via the kwarg form.
    from lucebox import profile as _profile_mod

    cell_name = f"cell-{index + 1:02d}-{short}"
    cell_start = time.time()  # wall clock on purpose: compared with st_mtime
    snapshot_rc = _profile_mod.run_profile(
        cfg,
        level="level1",
        console=console,
        out_dir=sweep_root,
        name=cell_name,
    )

    snapshot_dir = _resolve_snapshot_dir(sweep_root, cell_name, cell_start, results)
    found = snapshot_dir.exists()
    tps = _mean_decode_tps_from_snapshot(snapshot_dir) if found else None
    # A non-zero profile exit only counts as an error when it also left
    # us without any measurement.
    error = f"profile exit={snapshot_rc}" if snapshot_rc != 0 and tps is None else None
    tps_str = "—" if tps is None else f"{tps:.2f}"
    console.print(f"  → mean_decode_tps={tps_str}")
    return CellResult(
        index=index,
        config=candidate,
        snapshot_dir=snapshot_dir if found else None,
        mean_decode_tps=tps,
        error=error,
    )


def run_sweep(
    *,
    console: Console | None = None,
    ready_timeout: int = 60,
    yes: bool = False,
    profile: str = "heuristic",
) -> int:
    """Top-level sweep driver. Returns process exit code.

    1. Pre-flight: refuse if no systemd unit or no model configured.
    2. Snapshot config.toml + install signal trap.
    3. For each candidate from the selected profile's
       ``candidate_configs(host, preset)``:
         a. Write dflash.* fields via ``config_set``.
         b. Restart the systemd unit.
         c. Wait for /v1/models healthy.
         d. Score the cell — either by running the level1 profile and
            parsing decode_tps (``decode_tps_snapshot``), or by replaying
            the largest fitting multi-turn fixture case
            (``agent_replay_pass_rate``).
    4. Apply the winning config, restart, remove backup.

    ``profile``: one of ``autotune.PROFILES`` keys
    (``heuristic`` is the legacy preset-agnostic path; ``coding-agent-loop``
    is the agentic-workload-aware path).

    Exit codes: 0 on success; 1 when the user declines, no cell yields a
    winner, or the winner cannot be applied / the server does not come
    back; 2 for an unknown profile, failed pre-flight, or an empty
    bracket; 130 when interrupted by SIGINT/SIGTERM; the ``systemctl``
    exit code when the final restart fails. Unexpected exceptions are
    re-raised after the pre-sweep config has been restored.
    """
    console = console or Console()

    try:
        active_profile = autotune_mod.get_profile(profile)
    except KeyError as exc:
        console.print(f"[red]{exc}[/red]")
        return 2

    rc = _preflight(console)
    if rc is not None:
        return rc

    cfg = config_mod.load() or config_mod.live_config()
    host = from_env()
    # The LUCEBOX_HOST_* env vars are set by the lucebox.sh wrapper.
    # When the sweep is invoked directly (e.g. via `uv run python -m
    # lucebox` for development), those env vars are missing and
    # from_env() returns a zero-VRAM HostFacts — which makes every
    # profile bracket fall through to base-only. Fall back to the
    # persisted [host] block in config.toml, which was populated by an
    # earlier `lucebox check` / `autotune` run via the wrapper.
    if host.vram_gb == 0 and cfg.host.vram_gb > 0:
        host = cfg.host
    candidates = active_profile.candidate_configs(host, cfg.model.preset)
    if not candidates:
        console.print(
            f"[red]profile {profile!r} returned no candidate configs — "
            "nothing to sweep.[/red]"
        )
        return 2

    if not yes:
        # _format_eta() already prefixes "~"; adding one here printed "~~3 min".
        console.print(
            f"About to sweep [bold]{len(candidates)}[/bold] config(s) "
            f"({_format_eta(len(candidates))} total). "
            "Each cell restarts the server and runs `lucebox profile --level level1`."
        )
        try:
            answer = input("Proceed? [y/N] ").strip().lower()
        except EOFError:
            answer = ""
        if answer not in ("y", "yes"):
            console.print("[dim]aborted[/dim]")
            return 1

    # Create the output dir before touching config or signal handlers, so
    # a permissions error here cannot leave custom handlers installed.
    sweep_root = _sweep_root()
    sweep_root.mkdir(parents=True, exist_ok=True)

    backup = _take_backup(console)

    # SIGINT/SIGTERM trap: restore the backup + restart so the
    # interrupted sweep doesn't leave the server with a half-applied
    # cell config. The handler raises KeyboardInterrupt so both signals
    # share one cleanup path.
    def _signal_handler(signum, frame):  # noqa: ARG001
        raise KeyboardInterrupt(f"signal {signum}")

    old_sigint = signal.signal(signal.SIGINT, _signal_handler)
    old_sigterm = signal.signal(signal.SIGTERM, _signal_handler)

    results: list[CellResult] = []
    interrupted = False
    try:
        try:
            for idx, candidate in enumerate(candidates):
                results.append(
                    _sweep_cell(
                        console=console,
                        cfg=cfg,
                        candidate=candidate,
                        index=idx,
                        total=len(candidates),
                        scorer=active_profile.scorer,
                        ready_timeout=ready_timeout,
                        sweep_root=sweep_root,
                        results=results,
                    )
                )
        finally:
            # Restore the previous handlers exactly once, before any
            # cleanup below runs.
            signal.signal(signal.SIGINT, old_sigint)
            signal.signal(signal.SIGTERM, old_sigterm)
    except KeyboardInterrupt as exc:
        interrupted = True
        console.print(f"\n[yellow]interrupted ({exc})[/yellow]")
    except Exception as exc:
        # Unexpected crash mid-sweep — the last cell's _apply_config()
        # has already written its dflash.* fields to config.toml. Treat
        # this exactly like a SIGINT: restore the backup and restart so
        # the server isn't left running with a half-applied bracket
        # cell. We re-raise after cleanup so the operator still sees
        # the traceback (and so CI surfaces it as a failure).
        console.print(f"\n[red]sweep aborted: {type(exc).__name__}: {exc}[/red]")
        _restore_backup(console, backup)
        _systemctl_restart()
        raise

    if interrupted:
        _restore_backup(console, backup)
        _systemctl_restart()
        return 130  # canonical SIGINT exit code

    winner = _pick_winner(results, active_profile.scorer)
    if winner is None:
        if active_profile.scorer == "agent_replay_pass_rate":
            msg = "No cell passed the agent_replay probe — restoring pre-sweep config."
        else:
            msg = "No cell produced a decode_tps measurement — restoring pre-sweep config."
        console.print(f"[red]{msg}[/red]")
        _restore_backup(console, backup)
        _systemctl_restart()
        _print_results(console, results, None, active_profile.scorer)
        return 1

    # Apply the winner. The losing cells already wrote their dflash.*
    # fields during the loop, so the on-disk state is whatever the
    # final cell wrote — we overwrite with the winner's fields here.
    try:
        _apply_config(winner.config)
    except (KeyError, ValueError, OSError) as exc:
        # Same failure modes the per-cell write handles; a partial write
        # must not be left behind as the "tuned" config.
        console.print(
            f"[red]Applying winner failed (config_set: {exc}) — "
            "restoring pre-sweep config.[/red]"
        )
        _restore_backup(console, backup)
        _systemctl_restart()
        _print_results(console, results, winner, active_profile.scorer)
        return 1

    rc = _systemctl_restart()
    if rc != 0:
        console.print(
            f"[red]Restart after applying winner failed (exit={rc}). "
            "Backup retained.[/red]"
        )
        _print_results(console, results, winner, active_profile.scorer)
        return rc

    if not _wait_ready(cfg.port, ready_timeout):
        console.print(
            "[red]Server didn't come back after applying winner — backup retained.[/red]"
        )
        _print_results(console, results, winner, active_profile.scorer)
        return 1

    # Success path — purge the backup so the next `lucebox autotune
    # --sweep` starts clean.
    if backup is not None and backup.exists():
        backup.unlink()

    _print_results(console, results, winner, active_profile.scorer)
    return 0


__all__ = [
    "CellResult",
    "DFLASH_ALLOWLIST",
    "run_sweep",
]
def default_models_dir() -> Path:
    """Resolve the default models directory under the XDG Base Directory spec.

    $XDG_DATA_HOME (default ~/.local/share) is the conventional location for
    user-specific data files on Linux + macOS. Lucebox nests its model store
    under that so downloads live alongside other per-user app data instead
    of cluttering $HOME directly. The host wrapper bind-mounts this path
    into the container so paths line up in and out of the image.

    Returns ``<base>/lucebox/models`` where ``<base>`` is ``$XDG_DATA_HOME``,
    or ``~/.local/share`` when that variable is unset or empty.
    """
    base = os.environ.get("XDG_DATA_HOME") or str(Path.home() / ".local" / "share")
    return Path(base) / "lucebox" / "models"


GpuVendor = Literal["nvidia", "amd", "none"]


@dataclass(frozen=True, slots=True)
class HostFacts:
    """Probed once by lucebox.sh, passed in via env vars. Single source of
    truth on the Python side — we never reprobe (we can't see host /proc).

    Every field defaults to an "unknown / absent" value (0, "", False,
    "none"), so a default-constructed instance reports zero VRAM.
    """

    nproc: int = 0
    ram_gb: int = 0
    gpu_vendor: GpuVendor = "none"
    gpu_name: str = ""
    gpu_count: int = 0
    vram_gb: int = 0
    gpu_sm: str = ""  # e.g. "120" — matches docker-bake arch lists
    driver_version: str = ""  # e.g. "595.71.05"
    driver_major: int = 0
    has_systemd: bool = False
    is_wsl: bool = False
    has_docker: bool = False
    docker_version: str = ""
    ctk: CtkStatus = "none"


@dataclass(frozen=True, slots=True)
class DflashRuntime:
    """The DFLASH_* knobs as typed values. Serialized under [dflash] in TOML
    and emitted as -e DFLASH_FOO=bar args to docker run.

    The 11 fields below (budget through prefill_drafter) form the strict
    allowlist mirrored by lucebench's snapshot config.json — keep both
    in lockstep. ``think_max`` is a separate phase-1 thinking cap that
    isn't part of the runtime snapshot allowlist (it's per-request, not
    per-server).
    """

    budget: int = 22
    max_ctx: int = 16384
    lazy: bool = False
    prefix_cache_slots: int = 0
    prefill_cache_slots: int = 0
    cache_type_k: str = ""
    cache_type_v: str = ""
    prefill_mode: Literal["off", "auto", "always"] = "off"
    prefill_keep_ratio: float = 0.05
    prefill_threshold: int = 32000
    prefill_drafter: str = ""
    # Phase-1 (thinking) cap when a request opts into thinking. Default mirrors
    # antirez/ds4 ds4_eval.c: think_max_tokens = max_tokens - hard_limit_reply
    # budget = 16000 - 512 = 15488. The server's own hardcoded default is 10000.
    think_max: int = 15488
    # Flash-attention sliding-window on full-attention layers. 0 = full
    # attention (server default). On gemma4's hybrid iSWA the full-attn
    # layers grow KV linearly with max_ctx; a sparse fa_window keeps
    # decode compute bounded on long prompts without changing the KV
    # footprint. Passed through to the server's `--fa-window` flag
    # (see server/src/server/server_main.cpp).
    fa_window: int = 0
    # Soft-close thinking termination dial (PR #326 in lucebox-hub).
    # Lets the AR loop force the thinking close token early when the
    # close-token logit comes within this probability ratio of the
    # chosen-token logit.
    # Range [0.0, 1.0]; 0.0 = disabled (byte-identical to pre-change
    # behaviour). 0.5 = close when close-token prob >= 0.5 * chosen-token
    # prob; 0.9 = aggressive. Qwen3.5/3.6 AR path only in v1. Surfaced
    # to the server via DFLASH_THINK_SOFT_CLOSE_MIN_RATIO →
    # --think-soft-close-min-ratio.
    think_soft_close_min_ratio: float = 0.0
    # Diagnostic: when True, surface --debug-thinking-logits to the
    # server CLI via DFLASH_DEBUG_THINKING_LOGITS=1, producing one
    # stderr line per thinking AR step recording the close-vs-chosen
    # logit gap. Used to fit a sliding-ratio curve from real trajectory
    # data. Heavy stderr (one line per thinking token across all
    # in-flight requests); leave off in production.
    debug_thinking_logits: bool = False


@dataclass(frozen=True, slots=True)
class ModelMeta:
    """Which preset the operator picked at configure/download time.

    Persisted under ``[model]`` in config.toml so `lucebox serve` can
    pass ``DFLASH_TARGET`` (the target GGUF under
    ``/opt/lucebox-hub/server/models/``) and ``DFLASH_DRAFT`` for the
    draft GGUF (when one is published for the preset). The entrypoint's
    "multiple candidate GGUFs" branch never has to guess which one to
    load.

    ``target_file`` and ``draft_file`` are advanced overrides — when set
    they win over the preset's registry default. Empty strings mean
    "fall back to the registry value for [model] preset, then to the
    entrypoint's autodetect".
    """

    preset: str = ""
    target_file: str = ""
    draft_file: str = ""


@dataclass(frozen=True, slots=True)
class Config:
    """The whole config.toml, materialized.

    Nested sections default to their own default-constructed dataclass;
    ``models_dir`` defaults to ``default_models_dir()`` evaluated when the
    instance is created.
    """

    variant: Variant = "cuda12"
    image: str = "ghcr.io/luce-org/lucebox-hub"
    container_name: str = "lucebox"
    port: int = 8080
    models_dir: Path = field(default_factory=default_models_dir)
    dflash: DflashRuntime = field(default_factory=DflashRuntime)
    host: HostFacts = field(default_factory=HostFacts)
    model: ModelMeta = field(default_factory=ModelMeta)
docs/experiments/gemma4-26b-coding-agent-loop-sweep-2026-05-30.md). + assert runtime.max_ctx == 98304 + # lazy is False because the heuristic path does NOT set prefill_drafter, + # and the C++ server silently ignores --lazy-draft without it. Flipping + # to False makes the host config match runtime behaviour. See the + # `entrypoint.sh` warning emitted when the two are out-of-sync. + assert runtime.lazy is False + assert runtime.prefix_cache_slots == 0 + + +def test_native_24gb_caps_context_below_vmm_failure_boundary() -> None: + runtime = runtime_from_host(HostFacts(vram_gb=24, is_wsl=False)) + + assert runtime.budget == 22 + assert runtime.max_ctx == 98304 + assert runtime.lazy is False # see WSL test above + assert runtime.prefix_cache_slots == 0 + + +def test_no_heuristic_tier_sets_lazy_without_prefill_drafter() -> None: + """Regression for the `--lazy-draft ignored` silent no-op. + + The C++ dflash_server drops `--lazy-draft` unless `--prefill-drafter` + is also passed. The heuristic doesn't set `prefill_drafter`, so any + tier that sets `lazy=True` would produce a host config that doesn't + match what actually ran — exactly the mismatch the sindri decode + sweep tripped over (every docker.stderr contained the warning). 
+ """ + for vram in (0, 8, 16, 24, 40, 80): + for is_wsl in (False, True): + rt = runtime_from_host(HostFacts(vram_gb=vram, is_wsl=is_wsl)) + if rt.lazy: + assert rt.prefill_drafter, ( + f"vram={vram} is_wsl={is_wsl}: lazy=True without " + f"prefill_drafter → silent no-op on the C++ server" + ) + + +# ── Profile abstraction + coding-agent-loop bracket ─────────────────────── + + +def test_profiles_registered(): + """Two profiles ship: legacy heuristic + coding-agent-loop.""" + assert "heuristic" in PROFILES + assert "coding-agent-loop" in PROFILES + assert PROFILES["heuristic"].scorer == "decode_tps_snapshot" + assert PROFILES["coding-agent-loop"].scorer == "agent_replay_pass_rate" + + +def test_get_profile_unknown_raises(): + with pytest.raises(KeyError) as excinfo: + get_profile("does-not-exist") + assert "known" in str(excinfo.value).lower() + + +def test_coding_agent_loop_gemma_bracket_excludes_kv_axis(): + """Gemma4's KV is hardcoded F16 — the gemma bracket must NOT vary + cache_type_k/v (a no-op axis there). 
Regression guard.""" + host = HostFacts(vram_gb=24, is_wsl=True) + cells = PROFILES["coding-agent-loop"].candidate_configs(host, "gemma-4-26b") + assert cells, "gemma bracket must produce at least one cell at 24 GB" + kv_values = {(c.cache_type_k, c.cache_type_v) for c in cells} + assert kv_values == {("", "")}, ( + f"gemma cells should not vary KV quant — got {kv_values}" + ) + + +def test_coding_agent_loop_gemma_bracket_varies_target_axes(): + """The gemma bracket must vary max_ctx × fa_window × budget.""" + host = HostFacts(vram_gb=24, is_wsl=True) + cells = PROFILES["coding-agent-loop"].candidate_configs(host, "gemma-4-26b") + assert len({c.max_ctx for c in cells}) >= 2, "max_ctx should be a swept axis" + assert len({c.fa_window for c in cells}) >= 2, "fa_window should be a swept axis" + assert len({c.budget for c in cells}) >= 2, "budget should be a swept axis" + + +def test_coding_agent_loop_qwen_bracket_includes_kv_axis(): + """Qwen3.6 / laguna respect cache_type_k/v — their bracket must + sweep KV quant.""" + host = HostFacts(vram_gb=24, is_wsl=True) + cells = PROFILES["coding-agent-loop"].candidate_configs(host, "qwen3.6-27b") + kv_values = {c.cache_type_k for c in cells} + assert "tq3_0" in kv_values and "q8_0" in kv_values, ( + f"qwen bracket should include both tq3_0 and q8_0; got {kv_values}" + ) + + +def test_coding_agent_loop_low_vram_falls_back_to_base(): + """Below 22 GB the model barely fits — sweeping risks OOM. Both + arch builders should return just the heuristic base.""" + host = HostFacts(vram_gb=12) + cells_g = PROFILES["coding-agent-loop"].candidate_configs(host, "gemma-4-26b") + cells_q = PROFILES["coding-agent-loop"].candidate_configs(host, "qwen3.6-27b") + assert len(cells_g) == 1 + assert len(cells_q) == 1 + + +def test_large_model_gets_reduced_ctx_on_24gb() -> None: + """gemma-4-31b (~21 GB model) only has ~2 GB headroom on 24 GB VRAM. + The heuristic must cap max_ctx at 32K to avoid OOM, not 98K. 
+ Confirmed empirically on bragi (RTX 5090 Laptop) 2026-05-31.""" + host = HostFacts(vram_gb=24, is_wsl=True) + runtime_large = runtime_from_host(host, preset="gemma-4-31b") + runtime_small = runtime_from_host(host, preset="qwen3.6-27b") + + assert runtime_large.max_ctx == 32768, ( + "gemma-4-31b (~21 GB) leaves only ~2 GB for KV at 24 GB VRAM → must cap at 32K" + ) + assert runtime_small.max_ctx == 98304, ( + "qwen3.6-27b (~18 GB) leaves ~5 GB for KV at 24 GB VRAM → can use 98K" + ) + + +def test_unknown_preset_uses_tier_default() -> None: + """An unknown preset string falls back to the model-agnostic tier default.""" + host = HostFacts(vram_gb=24, is_wsl=True) + runtime_unknown = runtime_from_host(host, preset="some-future-model") + runtime_no_preset = runtime_from_host(host) + + assert runtime_unknown.max_ctx == runtime_no_preset.max_ctx, ( + "Unknown preset should produce the same result as no preset" + ) diff --git a/lucebox/tests/test_autotune_candidate_configs.py b/lucebox/tests/test_autotune_candidate_configs.py new file mode 100644 index 00000000..a5a4522c --- /dev/null +++ b/lucebox/tests/test_autotune_candidate_configs.py @@ -0,0 +1,92 @@ +"""Tests for ``autotune.candidate_configs`` — the sweep bracket. + +Pure-function tests: the bracket size matters per VRAM tier so a 5090- +Laptop test rig doesn't waste 90 min on configs that obviously won't +fit, and so a 24 GB box gets a useful empirical comparison against +the heuristic prior. +""" + +from __future__ import annotations + +from lucebox.types import HostFacts + +from lucebox import autotune as autotune_mod + + +def _key(rt) -> tuple: + """Stable identity tuple for de-dup checks across a config list.""" + return (rt.budget, rt.max_ctx, rt.cache_type_k, rt.cache_type_v) + + +def test_no_vram_signal_returns_single_config() -> None: + """``vram_gb=0`` → base only. 
No sweep — we have no signal to bracket.""" + configs = autotune_mod.candidate_configs(HostFacts(vram_gb=0)) + assert len(configs) == 1 + + +def test_8gb_host_no_sweep() -> None: + """8 GB → 1 config. Model barely fits; sweeping risks OOM.""" + configs = autotune_mod.candidate_configs(HostFacts(vram_gb=8)) + assert len(configs) == 1 + + +def test_24gb_host_six_to_twelve_configs() -> None: + """24 GB (the 5090/3090 tier) → ~6-12 cells covering budget × KV.""" + configs = autotune_mod.candidate_configs(HostFacts(vram_gb=24)) + assert 6 <= len(configs) <= 12 # noqa: PLR2004 + + +def test_24gb_host_includes_heuristic_config() -> None: + """The heuristic prior must be in the bracket so we can prove or disprove it.""" + host = HostFacts(vram_gb=24) + base = autotune_mod.runtime_from_host(host) + configs = autotune_mod.candidate_configs(host) + assert any(_key(c) == _key(base) for c in configs), ( + f"heuristic {_key(base)} missing from bracket {[_key(c) for c in configs]}" + ) + + +def test_80gb_host_at_most_eight_configs() -> None: + """80 GB → ≤ 8 configs. Larger brackets waste GPU time on + differences the H100/A100 won't show. + """ + configs = autotune_mod.candidate_configs(HostFacts(vram_gb=80)) + assert len(configs) <= 8 # noqa: PLR2004 + + +def test_no_duplicates_in_returned_list() -> None: + """Per-tier brackets must not generate redundant cells.""" + for vram in (8, 16, 24, 40, 80): + configs = autotune_mod.candidate_configs(HostFacts(vram_gb=vram)) + keys = [_key(c) for c in configs] + assert len(set(keys)) == len(keys), ( + f"vram={vram} GB produced duplicate cells: {keys}" + ) + + +def test_16gb_host_three_configs() -> None: + """12-21 GB tier: budget bracket only, 3 cells (including base).""" + configs = autotune_mod.candidate_configs(HostFacts(vram_gb=16)) + # Three budget values × 1 KV combo = 3 cells. 
+ assert len(configs) == 3 # noqa: PLR2004 + + +def test_40gb_host_includes_f16_kv() -> None: + """32-47 GB tier opens up f16 KV — the bracket should test it.""" + configs = autotune_mod.candidate_configs(HostFacts(vram_gb=40)) + kv_types = {c.cache_type_k for c in configs} + assert "f16" in kv_types + assert "tq3_0" in kv_types + assert "q8_0" in kv_types + + +def test_wsl_24gb_includes_wsl_heuristic() -> None: + """WSL 24 GB tier serves up to max_ctx=98304 (bumped from 65536 + after the 2026-05-30 sweep; the original 65K cap cited unverified + VMM failures and the empirical run proved 90K-token prompts on + 98K max_ctx work reliably). The bracket includes this base.""" + host = HostFacts(vram_gb=24, is_wsl=True) + base = autotune_mod.runtime_from_host(host) + assert base.max_ctx == 98304 # WSL heuristic, see runtime_from_host # noqa: PLR2004 + configs = autotune_mod.candidate_configs(host) + assert any(_key(c) == _key(base) for c in configs) diff --git a/lucebox/tests/test_autotune_cli.py b/lucebox/tests/test_autotune_cli.py new file mode 100644 index 00000000..77188fc2 --- /dev/null +++ b/lucebox/tests/test_autotune_cli.py @@ -0,0 +1,175 @@ +"""Tests for the ``lucebox autotune`` CLI surface.""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pytest +from lucebox.cli import app +from lucebox.types import HostFacts +from typer.testing import CliRunner + +from lucebox import autotune as autotune_mod +from lucebox import config as config_mod + + +def _set_config_path(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + """Pin the on-disk config.toml path under tmp.""" + cfg_path = tmp_path / "config.toml" + monkeypatch.setenv("LUCEBOX_HOME", str(tmp_path)) + return cfg_path + + +def _stub_host(monkeypatch: pytest.MonkeyPatch, vram_gb: int) -> None: + """Force the env-driven HostFacts to a known VRAM tier.""" + monkeypatch.setattr( + "lucebox.host_facts.from_env", + lambda: HostFacts(vram_gb=vram_gb, gpu_name="Test", 
gpu_count=1), + ) + # cli.py imports from_env directly into its module namespace. + monkeypatch.setattr("lucebox.cli.from_env", lambda: HostFacts(vram_gb=vram_gb)) + + +def test_autotune_json_dumps_dflashruntime( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _set_config_path(tmp_path, monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + result = CliRunner().invoke(app, ["autotune", "--json"]) + assert result.exit_code == 0 + # Trim ANSI escapes by parsing the bare body. + payload = json.loads(result.stdout) + # 24 GB native → max_ctx=98304, budget=22 (see autotune tiers). + assert payload["budget"] == 22 + assert payload["max_ctx"] == 98304 + + +def test_autotune_apply_writes_eleven_keys( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cfg_path = _set_config_path(tmp_path, monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + result = CliRunner().invoke(app, ["autotune", "--apply"]) + assert result.exit_code == 0 + assert cfg_path.exists() + # Read back via the same sparse-get to confirm each of the 11 + # allowlisted keys is now "from file" rather than "from default". + from lucebox.cli import DFLASH_ALLOWLIST + + entries = config_mod.config_get(path=cfg_path) + for name in DFLASH_ALLOWLIST: + value, origin = entries[f"dflash.{name}"] + assert origin == "file", f"dflash.{name} did not land in config.toml" + # think_max is NOT in the allowlist — autotune --apply must not touch it. + _value, origin = entries["dflash.think_max"] + assert origin == "default" + + +def test_autotune_default_view_does_not_write_file( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cfg_path = _set_config_path(tmp_path, monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + result = CliRunner().invoke(app, ["autotune"]) + assert result.exit_code == 0 + # No --apply → no file created. 
+ assert not cfg_path.exists() + + +def test_autotune_apply_refuses_when_persisted_value_differs( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """A previously-persisted dflash.* key with a different value blocks --apply.""" + cfg_path = _set_config_path(tmp_path, monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + # First apply: lands fresh recommendations. + rc1 = CliRunner().invoke(app, ["autotune", "--apply"]) + assert rc1.exit_code == 0 + # Now overwrite one key with a hand-tuned value (simulates a bench + # winner: budget=16 instead of the recommendation's 22). + config_mod.config_set("dflash.budget", 16, path=cfg_path) + config_mod.config_set("dflash.max_ctx", 65536, path=cfg_path) + # Re-invoke --apply: drift guard fires. + result = CliRunner().invoke(app, ["autotune", "--apply"]) + assert result.exit_code == 1 + assert "already differ" in result.stdout + assert "dflash.budget" in result.stdout + assert "current=16" in result.stdout + assert "recommended=22" in result.stdout + # Persisted bench winner stayed in place. + entries = config_mod.config_get(path=cfg_path) + assert entries["dflash.budget"][0] == 16 + assert entries["dflash.budget"][1] == "file" + + +def test_autotune_apply_force_bypasses_drift_guard( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cfg_path = _set_config_path(tmp_path, monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + config_mod.config_set("dflash.budget", 16, path=cfg_path) + config_mod.config_set("dflash.max_ctx", 65536, path=cfg_path) + result = CliRunner().invoke(app, ["autotune", "--apply", "--force"]) + assert result.exit_code == 0, result.stdout + entries = config_mod.config_get(path=cfg_path) + # The recommendation overwrites the bench winner. 
+ assert entries["dflash.budget"][0] == 22 + + +def test_autotune_apply_first_time_no_guard( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """When no keys are persisted yet, --apply lands clean (no drift to compare to).""" + cfg_path = _set_config_path(tmp_path, monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + result = CliRunner().invoke(app, ["autotune", "--apply"]) + assert result.exit_code == 0 + assert cfg_path.exists() + + +def test_autotune_sweep_flag_is_registered() -> None: + """``--sweep`` and ``--yes`` appear on the autotune CLI surface.""" + result = CliRunner().invoke(app, ["autotune", "--help"]) + assert result.exit_code == 0 + assert "--sweep" in result.output + assert "--yes" in result.output + + +def test_autotune_sweep_and_apply_are_mutually_exclusive( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """`--sweep --apply` is refused — sweep applies its own winner.""" + _set_config_path(tmp_path, monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + result = CliRunner().invoke(app, ["autotune", "--sweep", "--apply", "--yes"]) + assert result.exit_code == 2 # typer.Exit(code=2) # noqa: PLR2004 + + +def test_autotune_sweep_dispatches_to_run_sweep( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """`autotune --sweep` calls into lucebox.sweep.run_sweep (mocked).""" + _set_config_path(tmp_path, monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + + called = {"n": 0} + + def fake_run_sweep(**kw): # noqa: ARG001 + called["n"] += 1 + return 0 + + monkeypatch.setattr("lucebox.sweep.run_sweep", fake_run_sweep) + result = CliRunner().invoke(app, ["autotune", "--sweep", "--yes"]) + assert result.exit_code == 0 + assert called["n"] == 1 + + +def test_recommend_preset_tiers() -> None: + """The preset recommender follows the VRAM thresholds in the spec.""" + assert autotune_mod.recommend_preset(HostFacts(vram_gb=24)) == "qwen3.6-27b" + assert autotune_mod.recommend_preset(HostFacts(vram_gb=22)) == "qwen3.6-27b" + 
assert autotune_mod.recommend_preset(HostFacts(vram_gb=20)) == "laguna-xs.2" + assert autotune_mod.recommend_preset(HostFacts(vram_gb=16)) == "laguna-xs.2" + assert autotune_mod.recommend_preset(HostFacts(vram_gb=12)) is None + assert autotune_mod.recommend_preset(HostFacts(vram_gb=0)) is None diff --git a/lucebox/tests/test_check.py b/lucebox/tests/test_check.py new file mode 100644 index 00000000..3fdd469d --- /dev/null +++ b/lucebox/tests/test_check.py @@ -0,0 +1,118 @@ +"""Tests for ``lucebox check`` — readiness report. + +The check command has two surfaces that must stay independent: + + * pass/fail checks → drive the exit code, so the command is usable + as a CI exit-code gate; + * Host facts section → informational, prints the LUCEBOX_HOST_* + convoy that gets baked into /opt/lucebox-hub/HOST_INFO inside + the container. +""" + +from __future__ import annotations + +import pytest +from lucebox.cli import app +from lucebox.types import HostFacts +from rich.console import Console +from typer.testing import CliRunner + +from lucebox import host_check + + +def test_check_prints_host_facts_section(monkeypatch: pytest.MonkeyPatch) -> None: + """`lucebox check` includes a Host facts block sourced from LUCEBOX_HOST_*.""" + monkeypatch.setenv("LUCEBOX_HOST_OS_PRETTY", "Ubuntu 22.04.3 LTS") + monkeypatch.setenv("LUCEBOX_HOST_KERNEL", "6.6.87.2-microsoft-standard-WSL2") + monkeypatch.setenv("LUCEBOX_HOST_WSL_VERSION", "wsl2") + monkeypatch.setenv("LUCEBOX_HOST_DOCKER_VERSION", "29.1.3") + monkeypatch.setenv("LUCEBOX_HOST_DRIVER_VERSION", "596.36") + monkeypatch.setenv("LUCEBOX_HOST_NVIDIA_CTK_VERSION", "1.16.2") + monkeypatch.setenv("LUCEBOX_HOST_CPU_MODEL", "Intel Test CPU") + monkeypatch.setenv( + "LUCEBOX_HOST_GPU_LIST_CSV", + "0, GPU-abc, 00000000:01:00.0, NVIDIA RTX 5090, 12.0, 24576 MiB, 175.00 W", + ) + # Stub HostFacts so the pass/fail checks succeed at least minimally. + # `cli.check` imports `from_env` into its module namespace, so patch + # both names. 
+ def stub() -> HostFacts: + return HostFacts( + nproc=24, + ram_gb=64, + gpu_vendor="nvidia", + gpu_name="NVIDIA RTX 5090", + gpu_count=1, + vram_gb=24, + gpu_sm="120", + driver_version="596.36", + driver_major=596, + has_systemd=True, + is_wsl=True, + has_docker=True, + docker_version="29.1.3", + ctk="runtime", + ) + monkeypatch.setattr("lucebox.host_facts.from_env", stub) + monkeypatch.setattr("lucebox.cli.from_env", stub) + result = CliRunner().invoke(app, ["check"]) + # The pass/fail half of `check` should still exit 0 on this stubbed host. + assert result.exit_code == 0, result.stdout + assert "Host facts" in result.stdout + assert "Ubuntu 22.04.3 LTS" in result.stdout + assert "wsl2" in result.stdout + assert "1.16.2" in result.stdout + assert "Intel Test CPU" in result.stdout + # Multi-GPU table line. + assert "NVIDIA RTX 5090" in result.stdout + + +def test_render_host_facts_unset_env_shows_placeholders( + monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture[str] +) -> None: + """All LUCEBOX_HOST_* unset → section still renders with explicit (unset) markers.""" + for k in list(__import__("os").environ): + if k.startswith("LUCEBOX_HOST_"): + monkeypatch.delenv(k, raising=False) + console = Console(force_terminal=False, no_color=True, record=True) + host_check.render_host_facts(console) + text = console.export_text() + assert "Host facts" in text + # Multi-line section renders even when no env was passed in. + assert "(unset)" in text + assert "gpus" in text + + +def test_check_exit_code_independent_of_host_facts( + monkeypatch: pytest.MonkeyPatch, +) -> None: + """Host facts section must not change the exit-code semantics of check. + + Drives the pass/fail logic through a known-fail HostFacts (no docker) + and asserts the exit code is still 1, regardless of what the Host + facts block prints. 
+ """ + monkeypatch.setenv("LUCEBOX_HOST_OS_PRETTY", "Bare Linux") + def stub() -> HostFacts: + return HostFacts( + nproc=8, + ram_gb=16, + gpu_vendor="nvidia", + gpu_name="X", + gpu_count=1, + vram_gb=24, + gpu_sm="86", + driver_version="555.00", + driver_major=555, + has_systemd=False, + is_wsl=False, + has_docker=False, # → fail + docker_version="", + ctk="none", # also fail + ) + monkeypatch.setattr("lucebox.host_facts.from_env", stub) + monkeypatch.setattr("lucebox.cli.from_env", stub) + result = CliRunner().invoke(app, ["check"]) + assert result.exit_code == 1 + # Host facts block still printed despite the failure. + assert "Host facts" in result.stdout diff --git a/lucebox/tests/test_cli.py b/lucebox/tests/test_cli.py new file mode 100644 index 00000000..4c980ea6 --- /dev/null +++ b/lucebox/tests/test_cli.py @@ -0,0 +1,111 @@ +"""Tests for the top-level Typer surface.""" + +from __future__ import annotations + +import os + +import lucebox.cli as cli +from lucebox.cli import app +from typer.testing import CliRunner + + +def test_benchmark_subcommand_is_removed() -> None: + """The benchmark verb was folded into `autotune --sweep`.""" + result = CliRunner().invoke(app, ["benchmark", "--help"]) + assert result.exit_code != 0 + + +def test_profile_help_exposes_collapsed_surface() -> None: + """The new profile is a ~150-line wrapper; only --level and --url remain.""" + result = CliRunner().invoke(app, ["profile", "--help"]) + + assert result.exit_code == 0 + assert "--level" in result.output + assert "--url" in result.output + # Old step-registry knobs are gone. 
+ assert "--export-snapshot" not in result.output + assert "--force-refresh" not in result.output + assert "--dry-run" not in result.output + assert "--step" not in result.output + + +def test_default_variant_honors_wrapper_env() -> None: + old = os.environ.get("LUCEBOX_VARIANT") + try: + os.environ["LUCEBOX_VARIANT"] = "integration-props-uv-squared-clean-cuda12" + + assert cli._pick_variant_from_driver(555, "86") == ( + "integration-props-uv-squared-clean-cuda12" + ) + finally: + if old is None: + os.environ.pop("LUCEBOX_VARIANT", None) + else: + os.environ["LUCEBOX_VARIANT"] = old + + +def test_config_subcommand_is_registered() -> None: + result = CliRunner().invoke(app, ["config", "--help"]) + assert result.exit_code == 0 + assert "get" in result.output + assert "set" in result.output + assert "unset" in result.output + + +def test_models_subcommand_is_registered() -> None: + result = CliRunner().invoke(app, ["models", "--help"]) + assert result.exit_code == 0 + assert "list" in result.output + assert "download" in result.output + + +def test_autotune_subcommand_is_registered() -> None: + result = CliRunner().invoke(app, ["autotune", "--help"]) + assert result.exit_code == 0 + assert "--apply" in result.output + assert "--json" in result.output + + +def test_legacy_subcommands_are_removed() -> None: + """`configure` and `download-models` were folded into config/models.""" + cfg = CliRunner().invoke(app, ["configure", "--help"]) + assert cfg.exit_code != 0 + dl = CliRunner().invoke(app, ["download-models", "--help"]) + assert dl.exit_code != 0 + + +def test_server_run_spec_forwards_lucebox_host_env(monkeypatch) -> None: + """server_run_spec carries LUCEBOX_HOST_* from the orchestrator into the server. 
+ + lucebox.sh exports the LUCEBOX_HOST_* convoy before `docker run` on the + orchestrator; the orchestrator inherits them and we forward each one + as ``-e KEY=VALUE`` to the server container so entrypoint.sh's + write_host_info() can populate /opt/lucebox-hub/HOST_INFO. + """ + import lucebox.docker_run as docker_run + from lucebox.config import live_config + + # Scrub any pre-existing LUCEBOX_HOST_* env so the test sees only what we set. + for k in list(os.environ): + if k.startswith("LUCEBOX_HOST_"): + monkeypatch.delenv(k, raising=False) + monkeypatch.setenv("LUCEBOX_HOST_OS_PRETTY", "Ubuntu 22.04.3 LTS") + monkeypatch.setenv("LUCEBOX_HOST_KERNEL", "6.6.87.2-microsoft-standard-WSL2") + monkeypatch.setenv("LUCEBOX_HOST_WSL_VERSION", "wsl2") + monkeypatch.setenv( + "LUCEBOX_HOST_GPU_LIST_CSV", + "0, GPU-x, 00000000:01:00.0, NVIDIA RTX 5090, 12.0, 24576 MiB, 175.00 W", + ) + + cfg = live_config() + spec = docker_run.server_run_spec(cfg) + env_keys = {k for k, _ in spec.env} + assert "LUCEBOX_HOST_OS_PRETTY" in env_keys + assert "LUCEBOX_HOST_KERNEL" in env_keys + assert "LUCEBOX_HOST_WSL_VERSION" in env_keys + assert "LUCEBOX_HOST_GPU_LIST_CSV" in env_keys + # DFLASH_* still present. + assert "DFLASH_BUDGET" in env_keys + # Values surface verbatim. 
+ env_map = dict(spec.env) + assert env_map["LUCEBOX_HOST_OS_PRETTY"] == "Ubuntu 22.04.3 LTS" diff --git a/lucebox/tests/test_config.py b/lucebox/tests/test_config.py new file mode 100644 index 00000000..d60f3d88 --- /dev/null +++ b/lucebox/tests/test_config.py @@ -0,0 +1,176 @@ +"""Tests for the sparse TOML config persistence layer.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from lucebox.config import config_get, config_set, config_unset + +from lucebox import config + + +def test_legacy_env_migration_skips_invalid_values(tmp_path: Path) -> None: + legacy = tmp_path / "config.env" + legacy.write_text("DFLASH_BUDGET=not-an-int\nDFLASH_MAX_CTX=65536\nDFLASH_LAZY=true\n") + + cfg, _doc = config._load_legacy_env(legacy) + + assert cfg.dflash.budget == 22 + assert cfg.dflash.max_ctx == 65536 + assert cfg.dflash.lazy is True + + +def test_image_variant_round_trips_from_toml(tmp_path: Path) -> None: + path = tmp_path / "config.toml" + path.write_text( + "[image]\n" + 'registry = "ghcr.io/luce-org/lucebox-hub"\n' + 'variant = "integration-props-uv-squared-clean-cuda12"\n' + ) + + cfg = config._load_toml(path) + + assert cfg.image == "ghcr.io/luce-org/lucebox-hub" + assert cfg.variant == "integration-props-uv-squared-clean-cuda12" + + +def test_model_preset_round_trips_through_set_and_load(tmp_path: Path) -> None: + """Setting model.preset writes a sparse TOML doc that loads back correctly.""" + path = tmp_path / "config.toml" + config_set("model.preset", "gemma-4-26b", path=path) + config_set("model.target_file", "google_gemma-4-26B-A4B-it-Q4_K_M.gguf", path=path) + + cfg = config._load_toml(path) + assert cfg.model.preset == "gemma-4-26b" + assert cfg.model.target_file == "google_gemma-4-26B-A4B-it-Q4_K_M.gguf" + + +def test_legacy_config_without_model_section_stays_unpinned(tmp_path: Path) -> None: + """Legacy configs (no [model] section) must NOT silently pin to qwen.""" + path = tmp_path / "config.toml" + 
path.write_text('[image]\nvariant = "cuda12"\n') + + cfg = config._load_toml(path) + + assert cfg.model.preset == "" + assert cfg.model.target_file == "" + assert cfg.model.draft_file == "" + + +def test_model_section_picks_target_file_from_registry(tmp_path: Path) -> None: + """A bare [model] preset="..." entry pulls target_file from the registry.""" + path = tmp_path / "config.toml" + path.write_text('[model]\npreset = "gemma-4-31b"\n') + + cfg = config._load_toml(path) + + assert cfg.model.preset == "gemma-4-31b" + assert cfg.model.target_file == "google_gemma-4-31B-it-Q4_K_M.gguf" + + +def test_model_section_picks_draft_file_from_registry(tmp_path: Path) -> None: + """When preset has a published draft GGUF, [model] preset="..." picks draft_file too.""" + path = tmp_path / "config.toml" + path.write_text('[model]\npreset = "qwen3.6-27b"\n') + + cfg = config._load_toml(path) + assert cfg.model.preset == "qwen3.6-27b" + assert cfg.model.draft_file == "dflash-draft-3.6-q4_k_m.gguf" + + +def test_config_set_writes_only_named_key(tmp_path: Path) -> None: + """Sparse persistence: setting one key does NOT serialize every default.""" + path = tmp_path / "config.toml" + config_set("dflash.budget", 16, path=path) + body = path.read_text() + # The only [dflash] field that should appear is budget — none of the others. 
+ assert "[dflash]" in body + assert "budget = 16" in body + assert "max_ctx" not in body # not user-set, must not appear + assert "lazy" not in body + assert "[host]" not in body # whole section absent + assert "[image]" not in body # not touched either + + +def test_config_set_preserves_existing_keys(tmp_path: Path) -> None: + """Setting a new key leaves previously-set keys intact.""" + path = tmp_path / "config.toml" + config_set("dflash.budget", 16, path=path) + config_set("model.preset", "qwen3.6-27b", path=path) + body = path.read_text() + assert "budget = 16" in body + assert 'preset = "qwen3.6-27b"' in body + + +def test_config_unset_removes_one_key(tmp_path: Path) -> None: + """Unset removes the named key and leaves siblings alone.""" + path = tmp_path / "config.toml" + config_set("dflash.budget", 16, path=path) + config_set("dflash.max_ctx", 65536, path=path) + changed = config_unset("dflash.budget", path=path) + assert changed is True + body = path.read_text() + assert "budget" not in body + assert "max_ctx = 65536" in body + + +def test_config_unset_drops_empty_section(tmp_path: Path) -> None: + """Unsetting the last key in a section drops the empty section.""" + path = tmp_path / "config.toml" + config_set("dflash.budget", 16, path=path) + config_unset("dflash.budget", path=path) + body = path.read_text() + # The section may still exist as an empty table but `[dflash]` shouldn't. + assert "[dflash]" not in body + + +def test_config_get_reports_origin(tmp_path: Path) -> None: + """Each key carries an origin label — `file` when overridden, `default` otherwise.""" + path = tmp_path / "config.toml" + config_set("dflash.budget", 9, path=path) + entries = config_get(path=path) + assert entries["dflash.budget"] == (9, "file") + # max_ctx wasn't set so should report the live default. 
+ value, origin = entries["dflash.max_ctx"] + assert origin == "default" + assert value == 16384 # DflashRuntime.max_ctx default + + +def test_config_get_rejects_unknown_key(tmp_path: Path) -> None: + path = tmp_path / "config.toml" + with pytest.raises(KeyError): + config_get("not.a.key", path=path) + + +def test_config_set_rejects_unknown_key(tmp_path: Path) -> None: + path = tmp_path / "config.toml" + with pytest.raises(KeyError): + config_set("not.a.key", 1, path=path) + + +def test_config_set_auto_creates_file(tmp_path: Path) -> None: + """`config set` creates a missing config.toml on first write.""" + path = tmp_path / "config.toml" + assert not path.exists() + config_set("port", 9090, path=path) + assert path.exists() + assert "port = 9090" in path.read_text() + + +def test_save_writes_sparse_doc(tmp_path: Path) -> None: + """`save` writes whatever doc is handed in — no defaults serialized.""" + path = tmp_path / "config.toml" + cfg = config._from_dict({}) + config.save(cfg, path, doc={"dflash": {"budget": 9}}) + body = path.read_text() + assert "budget = 9" in body + assert "max_ctx" not in body + + +def test_live_config_uses_recommend_preset_indirectly(tmp_path: Path) -> None: + """``live_config()`` returns a Config — no implicit preset when none given.""" + # The function probes the env-provided HostFacts; with no preset arg + # we must NOT silently pin one (that would surprise legacy installs). 
+ cfg = config.live_config() + assert cfg.model.preset == "" diff --git a/lucebox/tests/test_config_cli.py b/lucebox/tests/test_config_cli.py new file mode 100644 index 00000000..446ab41b --- /dev/null +++ b/lucebox/tests/test_config_cli.py @@ -0,0 +1,127 @@ +"""Tests for the ``lucebox config`` sub-app CLI.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from lucebox.cli import app +from typer.testing import CliRunner + + +def _set_config_path(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + monkeypatch.setenv("LUCEBOX_HOME", str(tmp_path)) + return tmp_path / "config.toml" + + +def test_config_set_then_get_round_trip( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cfg_path = _set_config_path(tmp_path, monkeypatch) + set_result = CliRunner().invoke(app, ["config", "set", "dflash.budget=12"]) + assert set_result.exit_code == 0 + assert cfg_path.exists() + get_result = CliRunner().invoke(app, ["config", "get", "dflash.budget"]) + assert get_result.exit_code == 0 + assert "12" in get_result.stdout + assert "from file" in get_result.stdout + + +def test_config_get_with_no_key_lists_every_registered_key( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _set_config_path(tmp_path, monkeypatch) + result = CliRunner().invoke(app, ["config", "get"]) + assert result.exit_code == 0 + # Every registered dotted key shows up at least once. 
+ for key in ("model.preset", "dflash.budget", "port"): + assert key in result.stdout + + +def test_config_unset_drops_key( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cfg_path = _set_config_path(tmp_path, monkeypatch) + CliRunner().invoke(app, ["config", "set", "dflash.budget=9"]) + assert "budget = 9" in cfg_path.read_text() + unset_result = CliRunner().invoke(app, ["config", "unset", "dflash.budget"]) + assert unset_result.exit_code == 0 + body = cfg_path.read_text() + assert "budget" not in body + + +def test_config_set_unknown_key_errors( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _set_config_path(tmp_path, monkeypatch) + result = CliRunner().invoke(app, ["config", "set", "totally.unknown=1"]) + assert result.exit_code == 2 + + +def test_config_set_rejects_missing_equals( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _set_config_path(tmp_path, monkeypatch) + result = CliRunner().invoke(app, ["config", "set", "dflash.budget"]) + assert result.exit_code == 2 + + +def test_config_set_creates_file_when_missing( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cfg_path = _set_config_path(tmp_path, monkeypatch) + assert not cfg_path.exists() + CliRunner().invoke(app, ["config", "set", "port=9090"]) + assert cfg_path.exists() + assert "port = 9090" in cfg_path.read_text() + + +def test_load_or_build_env_overrides_persisted_config( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """LUCEBOX_* env vars must win over config.toml. + + Regression test for the precedence bug fixed in this commit: prior + to the fix, `_load_or_build()` returned `config_mod.load()`'s result + verbatim when config.toml existed, so the systemd unit's + `Environment=LUCEBOX_IMAGE=...` was silently ignored. Sindri's + config.toml had `[image]` without `registry`, which made the + dataclass default `ghcr.io/luce-org/lucebox-hub` win over the + intended easel image. 
+ """ + from lucebox.cli import _load_or_build + + cfg_path = _set_config_path(tmp_path, monkeypatch) + # Write a config.toml WITHOUT an image.registry line — the + # bug-trigger shape on sindri. + cfg_path.write_text( + '[image]\nvariant = "cuda12"\n[runtime]\nport = 9090\n' + '[dflash]\nbudget = 22\n' + ) + # Env should override what config.toml says (and what dataclass + # defaults fill in for missing keys). + monkeypatch.setenv("LUCEBOX_IMAGE", "ghcr.io/myfork/lucebox-hub") + monkeypatch.setenv("LUCEBOX_PORT", "7777") + monkeypatch.setenv("LUCEBOX_CONTAINER", "lucebox-test") + cfg = _load_or_build() + assert cfg.image == "ghcr.io/myfork/lucebox-hub" # env beats dataclass default + assert cfg.port == 7777 # env beats config.toml + assert cfg.container_name == "lucebox-test" # env applied + # variant is in config.toml — config.toml value (no env override). + assert cfg.variant == "cuda12" + # dflash IS persisted in config.toml — env doesn't touch it (no DFLASH_* + # env hooks at this layer). + assert cfg.dflash.budget == 22 + + +def test_load_or_build_no_toml_env_overrides_defaults( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """When config.toml is absent, env must still override defaults.""" + from lucebox.cli import _load_or_build + + _set_config_path(tmp_path, monkeypatch) + # Don't write a config.toml — exercise the live_config() fallback. + monkeypatch.setenv("LUCEBOX_IMAGE", "ghcr.io/myfork/lucebox-hub") + cfg = _load_or_build() + assert cfg.image == "ghcr.io/myfork/lucebox-hub" diff --git a/lucebox/tests/test_download.py b/lucebox/tests/test_download.py new file mode 100644 index 00000000..a64a10cd --- /dev/null +++ b/lucebox/tests/test_download.py @@ -0,0 +1,301 @@ +"""Tests for the model-download orchestration. + +The downloader now drives `huggingface_hub.hf_hub_download` directly +(no subprocess) and verifies size + sha256 against the repo metadata +before re-fetching. 
The tests stub out the network calls so the +behavior contract — what gets requested, when downloads are skipped — +stays pinned without actually talking to the Hub. +""" + +from pathlib import Path +from types import SimpleNamespace + +import pytest +from lucebox.download import DEFAULT_PRESET, PRESETS, resolve_preset, status + +from lucebox import download + + +def test_default_preset_uses_quantized_gguf_draft(): + assert DEFAULT_PRESET.draft_repo == "spiritbuun/Qwen3.6-27B-DFlash-GGUF" + assert DEFAULT_PRESET.draft_file == "dflash-draft-3.6-q4_k_m.gguf" + + +def test_default_preset_is_registered_under_qwen_name(): + assert DEFAULT_PRESET is PRESETS["qwen3.6-27b"] + assert DEFAULT_PRESET.name == "qwen3.6-27b" + + +def test_resolve_preset_returns_default_on_none(): + assert resolve_preset(None) is DEFAULT_PRESET + assert resolve_preset("") is DEFAULT_PRESET + + +def test_resolve_preset_picks_gemma_target_and_draft(): + pres = resolve_preset("gemma-4-26b") + assert pres.name == "gemma-4-26b" + assert pres.target_repo == "bartowski/google_gemma-4-26B-A4B-it-GGUF" + assert pres.target_file == "google_gemma-4-26B-A4B-it-Q4_K_M.gguf" + assert pres.draft_repo == "Lucebox/gemma-4-26B-A4B-it-DFlash-GGUF" + assert pres.draft_file == "gemma-4-26B-A4B-it-DFlash-q8_0.gguf" + assert pres.has_draft + + +def test_resolve_preset_supports_target_only_laguna(): + pres = resolve_preset("laguna-xs.2") + assert pres.target_repo == "Lucebox/Laguna-XS.2-GGUF" + assert pres.draft_repo is None + assert not pres.has_draft + + +def test_resolve_preset_picks_qwen36_moe_target_only(): + """Qwen3.6 MoE preset routes to unsloth's UD-Q4_K_M file, no draft. + + The MoE variant has no published DFlash draft GGUF (verified against + HfApi.repo_info 2026-05-28), so it runs target-only like Laguna. The + file stem is `Qwen3.6-35B-A3B-UD-Q4_K_M.gguf` — the unsloth repo only + publishes the UD ("unsloth dynamic") family at Q4_K_M, not a plain + `Q4_K_M.gguf`. 
+ """ + pres = resolve_preset("qwen3.6-moe") + assert pres.name == "qwen3.6-moe" + assert pres.target_repo == "unsloth/Qwen3.6-35B-A3B-GGUF" + assert pres.target_file == "Qwen3.6-35B-A3B-UD-Q4_K_M.gguf" + assert pres.draft_repo is None + assert pres.draft_file is None + assert not pres.has_draft + + +def test_download_preset_target_only_qwen36_moe_skips_draft(tmp_path, monkeypatch): + """qwen3.6-moe behaves identically to laguna-xs.2: target only, no draft fetch.""" + cfg = SimpleNamespace(models_dir=tmp_path) + pres = resolve_preset("qwen3.6-moe") + assert not pres.has_draft + fetches: list[tuple[str, str]] = [] + + def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]: + return 10, None + + def _stub_fetch(api, repo_id, filename, local_dir, console): # noqa: ARG001 + fetches.append((repo_id, filename)) + out = local_dir / filename + out.parent.mkdir(parents=True, exist_ok=True) + with out.open("wb") as f: + f.truncate(10) + return out + + monkeypatch.setattr(download, "_file_meta", _meta) + monkeypatch.setattr(download, "_fetch", _stub_fetch) + + assert download.download_preset(cfg, pres) == 0 + # Only the target — no draft attempt at all. + assert fetches == [(pres.target_repo, pres.target_file)] + + +def test_status_qwen36_moe_reports_draft_present_when_target_only(tmp_path, monkeypatch): + """No published draft → status reports draft_present=True (nothing to fetch).""" + cfg = SimpleNamespace(models_dir=tmp_path) + pres = resolve_preset("qwen3.6-moe") + + def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]: + return 22 * 10**9, None + + monkeypatch.setattr(download, "_file_meta", _meta) + # Target absent → target_present False, draft_present True (no draft). 
+ assert status(cfg, pres) == {"target_present": False, "draft_present": True} + + +def test_resolve_preset_unknown_name_lists_known_options(): + with pytest.raises(KeyError) as exc_info: + resolve_preset("qwen-99b") + msg = str(exc_info.value) + # Every registered preset must appear in the suggestion list so the + # user can copy-paste the right name. + for name in PRESETS: + assert name in msg + + +def _stub_file_meta(target_size: int, draft_size: int): + """Build a `_file_meta` replacement that returns (size, None) per repo+file. + + sha256 is left None so tests don't need to compute real hashes; the + real metadata path is exercised by the live `models download` + invocation, not the unit tests. + """ + + def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]: + if repo_id == DEFAULT_PRESET.target_repo and filename == DEFAULT_PRESET.target_file: + return target_size, None + if repo_id == DEFAULT_PRESET.draft_repo and filename == DEFAULT_PRESET.draft_file: + return draft_size, None + raise FileNotFoundError(f"unexpected ({repo_id}, {filename})") + + return _meta + + +def test_status_checks_default_draft_gguf(tmp_path, monkeypatch): + cfg = SimpleNamespace(models_dir=tmp_path) + draft_dir = tmp_path / "draft" + draft_dir.mkdir() + target = tmp_path / DEFAULT_PRESET.target_file + draft = draft_dir / DEFAULT_PRESET.draft_file + + monkeypatch.setattr(download, "_file_meta", _stub_file_meta(target_size=1024, draft_size=512)) + + # Neither file exists yet. + assert status(cfg) == {"target_present": False, "draft_present": False} + + # Write files at the expected sizes. 
+ with target.open("wb") as f: + f.truncate(1024) + with draft.open("wb") as f: + f.truncate(512) + assert status(cfg) == {"target_present": True, "draft_present": True} + + +def test_status_rejects_partial_model_files(tmp_path, monkeypatch): + cfg = SimpleNamespace(models_dir=tmp_path) + draft_dir = tmp_path / "draft" + draft_dir.mkdir() + target = tmp_path / DEFAULT_PRESET.target_file + draft = draft_dir / DEFAULT_PRESET.draft_file + target.write_bytes(b"partial") + draft.write_bytes(b"partial") + + # Repo says the target is 1 GB; a 7-byte file is partial, not present. + monkeypatch.setattr( + download, "_file_meta", _stub_file_meta(target_size=10**9, draft_size=10**6) + ) + assert status(cfg) == {"target_present": False, "draft_present": False} + + +def test_current_bytes_reads_xet_staging_path(tmp_path): + """Regression: progress polling must see hf-xet's hashed staging file. + + huggingface_hub 1.x writes partial Xet downloads to + ``{local_dir}/.cache/huggingface/download/{short_hash}.{etag}.incomplete`` + — NOT to ``{local_dir}/{filename}.incomplete``. Before the fix the + polling code only checked the latter (which never appears) so the + Rich progress bar sat at 0 bytes for the entire transfer. + """ + filename = "model.gguf" + etag = "abc123" + candidates = download._incomplete_path_candidates(tmp_path, filename, etag) + # The first candidate must point at the actual hf-xet staging path. + xet_path: Path = candidates[0] + assert xet_path.parent == tmp_path / ".cache" / "huggingface" / "download" + assert xet_path.name.endswith(f".{etag}.incomplete") + + # Now: writing to that path must be observed by _current_bytes. 
+ xet_path.parent.mkdir(parents=True, exist_ok=True) + xet_path.write_bytes(b"x" * 4096) + target = tmp_path / filename + assert download._current_bytes(target, candidates) == 4096 + + +def test_current_bytes_falls_back_to_glob_without_etag(tmp_path): + """When sha256 is unknown we still find growing .incomplete files.""" + filename = "model.gguf" + candidates = download._incomplete_path_candidates(tmp_path, filename, etag=None) + target = tmp_path / filename + + staging = tmp_path / ".cache" / "huggingface" / "download" + staging.mkdir(parents=True, exist_ok=True) + (staging / "deadbeef.deadbeef.incomplete").write_bytes(b"x" * 8192) + assert download._current_bytes(target, candidates) == 8192 + + +def test_current_bytes_prefers_final_target_when_complete(tmp_path): + filename = "model.gguf" + candidates = download._incomplete_path_candidates(tmp_path, filename, etag="abc") + target = tmp_path / filename + target.write_bytes(b"x" * 1234) + assert download._current_bytes(target, candidates) == 1234 + + +def test_download_preset_fetches_exact_draft_file(tmp_path, monkeypatch): + cfg = SimpleNamespace(models_dir=tmp_path) + fetches: list[tuple[str, str, str]] = [] + + monkeypatch.setattr(download, "_file_meta", _stub_file_meta(target_size=10, draft_size=10)) + + # Stub the actual download to record what was requested + create a stub + # file of the expected size so `_local_matches` would pass on a re-run. 
+ def _stub_fetch(api, repo_id, filename, local_dir, console): # noqa: ARG001 + fetches.append((repo_id, filename, str(local_dir))) + target = local_dir / filename + target.parent.mkdir(parents=True, exist_ok=True) + with target.open("wb") as f: + f.truncate(10) + return target + + monkeypatch.setattr(download, "_fetch", _stub_fetch) + + assert download.download_preset(cfg) == 0 + assert (DEFAULT_PRESET.target_repo, DEFAULT_PRESET.target_file, str(tmp_path)) in fetches + assert ( + DEFAULT_PRESET.draft_repo, + DEFAULT_PRESET.draft_file, + str(tmp_path / "draft"), + ) in fetches + + +def test_download_preset_routes_gemma_preset_to_correct_repos(tmp_path, monkeypatch): + cfg = SimpleNamespace(models_dir=tmp_path) + pres = resolve_preset("gemma-4-26b") + fetches: list[tuple[str, str, str]] = [] + + def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]: + return 10, None + + def _stub_fetch(api, repo_id, filename, local_dir, console): # noqa: ARG001 + fetches.append((repo_id, filename, str(local_dir))) + out = local_dir / filename + out.parent.mkdir(parents=True, exist_ok=True) + with out.open("wb") as f: + f.truncate(10) + return out + + monkeypatch.setattr(download, "_file_meta", _meta) + monkeypatch.setattr(download, "_fetch", _stub_fetch) + + assert download.download_preset(cfg, pres) == 0 + assert (pres.target_repo, pres.target_file, str(tmp_path)) in fetches + assert (pres.draft_repo, pres.draft_file, str(tmp_path / "draft")) in fetches + + +def test_download_preset_target_only_skips_draft_fetch(tmp_path, monkeypatch): + cfg = SimpleNamespace(models_dir=tmp_path) + pres = resolve_preset("laguna-xs.2") + assert not pres.has_draft + fetches: list[tuple[str, str]] = [] + + def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]: + return 10, None + + def _stub_fetch(api, repo_id, filename, local_dir, console): # noqa: ARG001 + fetches.append((repo_id, filename)) + out = local_dir / filename + out.parent.mkdir(parents=True, exist_ok=True) + with 
out.open("wb") as f: + f.truncate(10) + return out + + monkeypatch.setattr(download, "_file_meta", _meta) + monkeypatch.setattr(download, "_fetch", _stub_fetch) + + assert download.download_preset(cfg, pres) == 0 + # Target fetched, no draft fetch attempted at all. + assert fetches == [(pres.target_repo, pres.target_file)] + + +def test_status_target_only_preset_reports_draft_as_present(tmp_path, monkeypatch): + cfg = SimpleNamespace(models_dir=tmp_path) + pres = resolve_preset("laguna-xs.2") + + def _meta(_api, repo_id: str, filename: str) -> tuple[int, None]: + return 1024, None + + monkeypatch.setattr(download, "_file_meta", _meta) + # Target absent → target_present False, draft_present True (nothing to download). + assert status(cfg, pres) == {"target_present": False, "draft_present": True} diff --git a/lucebox/tests/test_models_cli.py b/lucebox/tests/test_models_cli.py new file mode 100644 index 00000000..f4458304 --- /dev/null +++ b/lucebox/tests/test_models_cli.py @@ -0,0 +1,142 @@ +"""Tests for the ``lucebox models`` sub-app.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from lucebox.cli import app +from lucebox.download import PRESETS +from lucebox.types import HostFacts +from typer.testing import CliRunner + +from lucebox import config as config_mod +from lucebox import download as download_mod + + +def _set_config_path(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + monkeypatch.setenv("LUCEBOX_HOME", str(tmp_path)) + monkeypatch.setenv("LUCEBOX_MODELS", str(tmp_path / "models")) + return tmp_path / "config.toml" + + +def _stub_host(monkeypatch: pytest.MonkeyPatch, vram_gb: int) -> None: + monkeypatch.setattr("lucebox.host_facts.from_env", lambda: HostFacts(vram_gb=vram_gb)) + monkeypatch.setattr("lucebox.cli.from_env", lambda: HostFacts(vram_gb=vram_gb)) + + +def test_models_list_shows_every_registered_preset( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _set_config_path(tmp_path, 
monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + result = CliRunner().invoke(app, ["models", "list"]) + assert result.exit_code == 0 + for name in PRESETS: + assert name in result.stdout + + +def test_models_default_view_lists_only_installed( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + _set_config_path(tmp_path, monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + # No models on disk → default view says "no presets installed". + result = CliRunner().invoke(app, ["models"]) + assert result.exit_code == 0 + assert "No presets installed" in result.stdout or "Models dir" in result.stdout + + +def test_models_download_recommends_when_empty( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """No preset configured + nothing on argv → auto-recommend + auto-activate.""" + cfg_path = _set_config_path(tmp_path, monkeypatch) + _stub_host(monkeypatch, vram_gb=24) + + # Stub the network calls so the test doesn't try to talk to HF. + monkeypatch.setattr(download_mod, "download_preset", lambda cfg, pres: 0) + monkeypatch.setattr( + download_mod, + "status", + lambda cfg, pres: {"target_present": True, "draft_present": True}, + ) + + result = CliRunner().invoke(app, ["models", "download"]) + assert result.exit_code == 0 + assert "Recommended preset" in result.stdout + assert cfg_path.exists() + # The active preset should now be model.preset = qwen3.6-27b. 
+    entries = config_mod.config_get(path=cfg_path)
+    assert entries["model.preset"] == ("qwen3.6-27b", "file")
+
+
+def test_models_download_refuses_silent_switch(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """When a preset is already active, `download` with no arg refuses."""
+    cfg_path = _set_config_path(tmp_path, monkeypatch)
+    _stub_host(monkeypatch, vram_gb=24)
+    config_mod.config_set("model.preset", "qwen3.6-27b", path=cfg_path)
+
+    result = CliRunner().invoke(app, ["models", "download"])
+    assert result.exit_code == 2
+    assert "already active" in result.stdout.lower()
+
+
+def test_models_download_explicit_preset_no_activate(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """Passing a preset without --activate downloads but doesn't flip model.preset."""
+    cfg_path = _set_config_path(tmp_path, monkeypatch)
+    _stub_host(monkeypatch, vram_gb=24)
+    monkeypatch.setattr(download_mod, "download_preset", lambda cfg, pres: 0)
+    monkeypatch.setattr(
+        download_mod,
+        "status",
+        lambda cfg, pres: {"target_present": False, "draft_present": False},
+    )
+
+    result = CliRunner().invoke(app, ["models", "download", "gemma-4-26b"])
+    assert result.exit_code == 0
+    if cfg_path.exists():
+        entries = config_mod.config_get(path=cfg_path)
+        assert entries["model.preset"] == ("", "default")
+
+
+def test_models_download_explicit_preset_with_activate(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    cfg_path = _set_config_path(tmp_path, monkeypatch)
+    _stub_host(monkeypatch, vram_gb=24)
+    monkeypatch.setattr(download_mod, "download_preset", lambda cfg, pres: 0)
+    monkeypatch.setattr(
+        download_mod,
+        "status",
+        lambda cfg, pres: {"target_present": False, "draft_present": False},
+    )
+
+    result = CliRunner().invoke(app, ["models", "download", "gemma-4-26b", "--activate"])
+    assert result.exit_code == 0
+    entries = config_mod.config_get(path=cfg_path)
+    assert entries["model.preset"] == ("gemma-4-26b", "file")
+
+
+def test_installed_helpers_track_presence(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """``installed_status`` / ``installed_size_gb`` reflect on-disk byte counts."""
+    _set_config_path(tmp_path, monkeypatch)
+    _stub_host(monkeypatch, vram_gb=24)
+    cfg = config_mod.live_config()
+    cfg.models_dir.mkdir(parents=True, exist_ok=True)
+    laguna = PRESETS["laguna-xs.2"]
+    assert download_mod.installed_status(cfg, laguna) == "absent"
+
+    target = cfg.models_dir / laguna.target_file
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # truncate() sets the 5 GB size without building or writing a 5 GB buffer.
+    with target.open("wb") as f:
+        f.truncate(5 * 10**9)
+    assert download_mod.installed_status(cfg, laguna) == "installed"
+    assert download_mod.installed_size_gb(cfg, laguna) == pytest.approx(5.0, rel=0.01)
diff --git a/lucebox/tests/test_profile.py b/lucebox/tests/test_profile.py
new file mode 100644
index 00000000..78acb18a
--- /dev/null
+++ b/lucebox/tests/test_profile.py
@@ -0,0 +1,159 @@
+"""Tests for the collapsed ``lucebox profile`` wrapper.
+
+The profile module is now a thin shim over ``luce-bench snapshot``: it
+probes host facts, picks an output dir, and exec's ``docker exec``. The
+tests below pin the wrapper contract — no behavior tests of the bench
+itself (those live in luce-bench's own test suite).
+""" + +from __future__ import annotations + +import json +import subprocess +from pathlib import Path +from typing import Any + +import pytest +from lucebox.types import Config, HostFacts + +from lucebox import profile + + +def _cfg(tmp_path: Path) -> Config: + """Build a Config with a deterministic models_dir + sentinel host facts.""" + return Config( + models_dir=tmp_path / "models", + host=HostFacts( + gpu_vendor="nvidia", + gpu_name="Test GPU 5090", + vram_gb=32, + nproc=16, + ram_gb=64, + driver_version="595.71.05", + gpu_sm="12.0", + gpu_count=1, + ), + ) + + +def test_server_base_urls_includes_docker_host_route(tmp_path: Path) -> None: + cfg = _cfg(tmp_path) + urls = profile._server_base_urls(cfg) + assert urls[0] == f"http://127.0.0.1:{cfg.port}" + assert any("host.docker.internal" in u for u in urls) + assert any("172.17.0.1" in u for u in urls) + + +def test_server_base_urls_honors_override(tmp_path: Path) -> None: + cfg = _cfg(tmp_path) + assert profile._server_base_urls(cfg, "http://example/") == ["http://example"] + + +def test_host_info_payload_carries_canonical_keys(tmp_path: Path) -> None: + """The payload handed to luce-bench matches the bench's expected schema.""" + cfg = _cfg(tmp_path) + payload = profile._host_info_payload(cfg) + expected = { + "cpu_model", + "nproc", + "ram_gb", + "gpu_name", + "gpu_count", + "vram_gb", + "gpu_sm", + "gpu_power_limit_w", + "driver_version", + "cuda_runtime_version", + "nvidia_smi_csv", + "lucebox_host_facts", + } + assert expected.issubset(payload.keys()) + assert payload["gpu_name"] == "Test GPU 5090" + assert payload["vram_gb"] == 32 + + +def test_run_profile_rejects_unknown_level( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cfg = _cfg(tmp_path) + rc = profile.run_profile(cfg, level="level42") + assert rc == 2 + + +def test_run_profile_errors_when_container_not_running( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Clear error when there's no container — the wrapper 
must NOT try to boot one.""" + cfg = _cfg(tmp_path) + monkeypatch.setattr(profile, "_container_running", lambda name: False) + rc = profile.run_profile(cfg, level="level1") + assert rc == 2 + + +def test_run_profile_exec_docker_exec_with_expected_args( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Happy path: builds the docker exec argv and writes a host-info file.""" + cfg = _cfg(tmp_path) + monkeypatch.setattr(profile, "_container_running", lambda name: True) + monkeypatch.setattr(profile, "_json_get", lambda url, timeout_s=5.0: {}) + # Pin the output dir under tmp_path so the test is hermetic. + monkeypatch.setattr(profile, "_profile_out_dir", lambda: tmp_path / "snaps") + + invocations: list[list[str]] = [] + + def fake_run(cmd: list[str], check: bool = False) -> subprocess.CompletedProcess[Any]: + invocations.append(cmd) + return subprocess.CompletedProcess(cmd, 0, "", "") + + monkeypatch.setattr(subprocess, "run", fake_run) + rc = profile.run_profile(cfg, level="level1", url="http://127.0.0.1:8080") + assert rc == 0 + assert len(invocations) == 1 + cmd = invocations[0] + assert cmd[:3] == ["docker", "exec", cfg.container_name] + assert "luce-bench" in cmd + assert "snapshot" in cmd + assert "--level" in cmd + i = cmd.index("--level") + assert cmd[i + 1] == "level1" + # host-info file was written. 
+ host_info_path = tmp_path / "snaps" / "_host-info.json" + assert host_info_path.exists() + payload = json.loads(host_info_path.read_text()) + assert payload["gpu_name"] == "Test GPU 5090" + + +def test_run_profile_passes_url_override_through( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cfg = _cfg(tmp_path) + monkeypatch.setattr(profile, "_container_running", lambda name: True) + monkeypatch.setattr(profile, "_profile_out_dir", lambda: tmp_path / "snaps") + captured: list[list[str]] = [] + + def fake_run(cmd: list[str], check: bool = False) -> subprocess.CompletedProcess[Any]: + captured.append(cmd) + return subprocess.CompletedProcess(cmd, 0, "", "") + + monkeypatch.setattr(subprocess, "run", fake_run) + rc = profile.run_profile(cfg, level="level2", url="http://example:9000") + assert rc == 0 + cmd = captured[0] + i = cmd.index("--url") + assert cmd[i + 1] == "http://example:9000" + + +def test_run_profile_returns_subprocess_rc( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + cfg = _cfg(tmp_path) + monkeypatch.setattr(profile, "_container_running", lambda name: True) + monkeypatch.setattr(profile, "_profile_out_dir", lambda: tmp_path / "snaps") + + def fake_run(cmd: list[str], check: bool = False) -> subprocess.CompletedProcess[Any]: + return subprocess.CompletedProcess(cmd, 7, "", "") + + monkeypatch.setattr(subprocess, "run", fake_run) + rc = profile.run_profile(cfg, level="level0", url="http://x") + assert rc == 7 diff --git a/lucebox/tests/test_smoke.py b/lucebox/tests/test_smoke.py new file mode 100644 index 00000000..e14fb196 --- /dev/null +++ b/lucebox/tests/test_smoke.py @@ -0,0 +1,36 @@ +import json + +import httpx + +from lucebox import smoke + + +def test_tool_smoke_disables_thinking_for_deterministic_tool_calls(): + seen_body = {} + + def handler(request: httpx.Request) -> httpx.Response: + nonlocal seen_body + seen_body = json.loads(request.content) + return httpx.Response( + 200, + json={ + "choices": [ + { + 
"message": { + "tool_calls": [ + {"function": {"name": "report_status", "arguments": "{}"}} + ] + }, + "finish_reason": "tool_calls", + } + ] + }, + ) + + client = httpx.Client(transport=httpx.MockTransport(handler)) + + ok, err = smoke._check_tool_call(client, "http://test", 1.0) + + assert ok is True + assert err == "" + assert seen_body["chat_template_kwargs"] == {"enable_thinking": False} diff --git a/lucebox/tests/test_sweep.py b/lucebox/tests/test_sweep.py new file mode 100644 index 00000000..d0fe408d --- /dev/null +++ b/lucebox/tests/test_sweep.py @@ -0,0 +1,523 @@ +"""Integration-style tests for ``lucebox.sweep.run_sweep``. + +All side-effects (subprocess.run, urllib, config_set, signal handlers) +are mocked. The goal is to verify the orchestration contract rather +than exercise real systemd / docker / urllib — the underlying +primitives (config.config_set, autotune.candidate_configs, +profile.run_profile) have their own tests. +""" + +from __future__ import annotations + +import json +import os +import signal +from pathlib import Path +from unittest import mock + +import pytest +from lucebox.types import DflashRuntime, HostFacts + +from lucebox import sweep as sweep_mod + +# ── fixtures ─────────────────────────────────────────────────────────────── + + +@pytest.fixture +def stub_env(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + """Pin LUCEBOX_HOME + XDG_DATA_HOME + XDG_CONFIG_HOME under tmp. + + Also creates a fake systemd unit file so the pre-flight check + passes, and seeds config.toml with a model preset so we don't + bail on "no model configured". 
+ """ + monkeypatch.setenv("LUCEBOX_HOME", str(tmp_path / "lucebox")) + monkeypatch.setenv("XDG_DATA_HOME", str(tmp_path / "data")) + monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config")) + # Fake systemd unit + unit_path = tmp_path / "config" / "systemd" / "user" / "lucebox.service" + unit_path.parent.mkdir(parents=True, exist_ok=True) + unit_path.write_text("# fake unit\n") + # Seed config.toml with a model preset. + cfg_path = tmp_path / "lucebox" / "config.toml" + cfg_path.parent.mkdir(parents=True, exist_ok=True) + cfg_path.write_text( + '[model]\npreset = "qwen3.6-27b"\n' + 'target_file = "Qwen3-Coder-30B-A3B-Instruct-Q4_K_M.gguf"\n' + '\n[dflash]\nbudget = 22\nmax_ctx = 16384\n' + ) + # Force HostFacts to a known 24 GB tier (the test bracket has 6 cells). + monkeypatch.setattr( + "lucebox.host_facts.from_env", + lambda: HostFacts(vram_gb=24, gpu_name="RTX 5090", gpu_count=1), + ) + monkeypatch.setattr( + "lucebox.sweep.from_env", + lambda: HostFacts(vram_gb=24, gpu_name="RTX 5090", gpu_count=1), + ) + return tmp_path + + +def _write_synthetic_snapshot(snapshot_dir: Path, decode_tps: float) -> None: + """Drop a tiny smoke.json into ``snapshot_dir`` carrying ``decode_tps``. + + Schema mirrors what ``luce-bench snapshot --level level1`` writes — + `_mean_decode_tps_from_snapshot` averages every row that carries + `timings.decode_tokens_per_sec`. 
+ """ + snapshot_dir.mkdir(parents=True, exist_ok=True) + payload = { + "area": "smoke", + "n": 1, + "rows": [ + { + "pass": True, + "timings": { + "decode_tokens_per_sec": decode_tps, + "prefill_ms": 100, + "decode_ms": 1000, + }, + } + ], + } + (snapshot_dir / "smoke.json").write_text(json.dumps(payload)) + + +# ── unit tests for the helpers ───────────────────────────────────────────── + + +def test_mean_decode_tps_averages_per_area_rows(tmp_path: Path) -> None: + snap = tmp_path / "cell" + snap.mkdir() + (snap / "smoke.json").write_text( + json.dumps( + { + "rows": [ + {"timings": {"decode_tokens_per_sec": 40.0}}, + {"timings": {"decode_tokens_per_sec": 50.0}}, + ] + } + ) + ) + (snap / "code.json").write_text( + json.dumps({"rows": [{"timings": {"decode_tokens_per_sec": 60.0}}]}) + ) + tps = sweep_mod._mean_decode_tps_from_snapshot(snap) + assert tps == pytest.approx(50.0) # (40+50+60)/3 + + +def test_mean_decode_tps_ignores_underscore_and_identity_files(tmp_path: Path) -> None: + snap = tmp_path / "cell" + snap.mkdir() + (snap / "_summary.json").write_text( + json.dumps({"rows": [{"timings": {"decode_tokens_per_sec": 999.0}}]}) + ) + (snap / "host.json").write_text( + json.dumps({"rows": [{"timings": {"decode_tokens_per_sec": 999.0}}]}) + ) + (snap / "smoke.json").write_text( + json.dumps({"rows": [{"timings": {"decode_tokens_per_sec": 40.0}}]}) + ) + tps = sweep_mod._mean_decode_tps_from_snapshot(snap) + assert tps == pytest.approx(40.0) + + +def test_mean_decode_tps_returns_none_when_no_rows(tmp_path: Path) -> None: + snap = tmp_path / "empty" + snap.mkdir() + (snap / "smoke.json").write_text(json.dumps({"rows": []})) + assert sweep_mod._mean_decode_tps_from_snapshot(snap) is None + + +def test_pick_winner_breaks_ties_by_max_ctx_then_budget() -> None: + r1 = sweep_mod.CellResult( + index=0, + config=DflashRuntime(budget=22, max_ctx=131072), + snapshot_dir=None, + mean_decode_tps=50.0, + error=None, + ) + r2 = sweep_mod.CellResult( + index=1, + 
config=DflashRuntime(budget=22, max_ctx=65536), + snapshot_dir=None, + mean_decode_tps=50.0, + error=None, + ) + r3 = sweep_mod.CellResult( + index=2, + config=DflashRuntime(budget=32, max_ctx=65536), + snapshot_dir=None, + mean_decode_tps=50.0, + error=None, + ) + winner = sweep_mod._pick_winner([r1, r2, r3], "decode_tps_snapshot") + # All tied tps. Lower max_ctx wins, then lower budget. + assert winner is r2 + + +def test_pick_winner_returns_none_when_all_failed() -> None: + r = sweep_mod.CellResult( + index=0, + config=DflashRuntime(), + snapshot_dir=None, + mean_decode_tps=None, + error="server-not-ready", + ) + assert sweep_mod._pick_winner([r], "decode_tps_snapshot") is None + + +def test_pick_winner_picks_highest_tps() -> None: + r1 = sweep_mod.CellResult( + index=0, config=DflashRuntime(budget=8), snapshot_dir=None, mean_decode_tps=10.0, error=None + ) + r2 = sweep_mod.CellResult( + index=1, config=DflashRuntime(budget=22), snapshot_dir=None, + mean_decode_tps=50.0, error=None, + ) + r3 = sweep_mod.CellResult( + index=2, config=DflashRuntime(budget=32), snapshot_dir=None, + mean_decode_tps=30.0, error=None, + ) + assert sweep_mod._pick_winner([r1, r2, r3], "decode_tps_snapshot") is r2 + + +def test_pick_winner_agent_replay_filters_failures_and_ranks_by_ctx_then_speed() -> None: + """coding-agent-loop ranking (post-3dffb30): only passing cells + qualify; larger ``max_ctx`` is the PRIMARY sort key; within the + same max_ctx, higher ``speed_metric`` wins. + + Cross-max_ctx speed comparisons are apples-to-oranges because + smaller-ctx cells run against shorter fixture cases (the picker + selects the largest case that fits). Sorting by speed first would + systematically pick the smaller context — see commit 3dffb30 + (bragi sweep 2026-05-30) for the concrete metric artifact. 
+ """ + failed = sweep_mod.CellResult( + index=0, + config=DflashRuntime(max_ctx=131072, fa_window=2048, budget=22), + snapshot_dir=None, + mean_decode_tps=None, + error=None, + passed=False, + pass_reason="HTTP 500", + speed_metric=None, + ) + big_ctx_slow = sweep_mod.CellResult( + index=1, + config=DflashRuntime(max_ctx=131072, fa_window=0, budget=22), + snapshot_dir=None, + mean_decode_tps=None, + error=None, + passed=True, + pass_reason="ok", + speed_metric=5.0, + ) + small_ctx_fast = sweep_mod.CellResult( + index=2, + config=DflashRuntime(max_ctx=98304, fa_window=2048, budget=22), + snapshot_dir=None, + mean_decode_tps=None, + error=None, + passed=True, + pass_reason="ok", + speed_metric=25.0, + ) + winner = sweep_mod._pick_winner( + [failed, big_ctx_slow, small_ctx_fast], "agent_replay_pass_rate" + ) + assert winner is big_ctx_slow, ( + "larger max_ctx must win even when a smaller-ctx cell shows higher " + "speed_metric (different fixture cases → apples-to-oranges)" + ) + + +def test_pick_winner_agent_replay_speed_breaks_tie_within_same_max_ctx() -> None: + """Within the same max_ctx, higher speed_metric wins.""" + slow_at_131k = sweep_mod.CellResult( + index=0, + config=DflashRuntime(max_ctx=131072, fa_window=0, budget=22), + snapshot_dir=None, + mean_decode_tps=None, + error=None, + passed=True, + speed_metric=2.5, + ) + fast_at_131k = sweep_mod.CellResult( + index=1, + config=DflashRuntime(max_ctx=131072, fa_window=0, budget=16), + snapshot_dir=None, + mean_decode_tps=None, + error=None, + passed=True, + speed_metric=3.5, + ) + winner = sweep_mod._pick_winner( + [slow_at_131k, fast_at_131k], "agent_replay_pass_rate" + ) + assert winner is fast_at_131k + + +def test_pick_winner_agent_replay_returns_none_when_all_failed() -> None: + failed = sweep_mod.CellResult( + index=0, + config=DflashRuntime(max_ctx=131072), + snapshot_dir=None, + mean_decode_tps=None, + error=None, + passed=False, + pass_reason="HTTP 500", + speed_metric=None, + ) + assert 
sweep_mod._pick_winner([failed], "agent_replay_pass_rate") is None + + +def test_pick_winner_agent_replay_tiebreak_prefers_larger_max_ctx() -> None: + """Tied speed → larger max_ctx wins (more headroom for the workload).""" + small_ctx = sweep_mod.CellResult( + index=0, + config=DflashRuntime(max_ctx=65536, fa_window=0, budget=22), + snapshot_dir=None, + mean_decode_tps=None, + error=None, + passed=True, + speed_metric=20.0, + ) + big_ctx = sweep_mod.CellResult( + index=1, + config=DflashRuntime(max_ctx=131072, fa_window=0, budget=22), + snapshot_dir=None, + mean_decode_tps=None, + error=None, + passed=True, + speed_metric=20.0, + ) + winner = sweep_mod._pick_winner([small_ctx, big_ctx], "agent_replay_pass_rate") + assert winner is big_ctx + + +def test_fa_window_in_dflash_allowlist() -> None: + """fa_window must be in the sweep's write allowlist so the + bracket axis lands on disk per cell.""" + assert "fa_window" in sweep_mod.DFLASH_ALLOWLIST + + +def test_sweep_falls_back_to_persisted_host_when_env_empty( + stub_env: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Regression: when LUCEBOX_HOST_* env vars are absent (e.g. sweep + invoked via `uv run` instead of the lucebox.sh wrapper), the sweep + must read host facts from config.toml's persisted [host] block — + otherwise every profile bracket falls through to base-only and the + sweep silently degrades to a 1-cell smoke test.""" + # Persist a [host] section with real VRAM in the test's config.toml. + cfg_path = stub_env / "lucebox" / "config.toml" + cfg_text = cfg_path.read_text() if cfg_path.exists() else "" + cfg_path.write_text( + cfg_text + + "\n[host]\nvram_gb = 24\ngpu_vendor = \"nvidia\"\ngpu_count = 1\n" + ) + # Ensure the LUCEBOX_HOST_* env vars are NOT set. + for k in list(os.environ): + if k.startswith("LUCEBOX_HOST_"): + monkeypatch.delenv(k, raising=False) + + # Stub the heavyweight side-effects (subprocess, urllib, restart) + # so the test only exercises the host-facts resolution path. 
+ monkeypatch.setattr(sweep_mod, "_systemctl_restart", lambda: 0) + monkeypatch.setattr(sweep_mod, "_wait_ready", lambda *a, **kw: True) + monkeypatch.setattr( + sweep_mod, + "_score_agent_replay", + lambda *a, **kw: (True, "ok", 20.0, "test-case", 1024), + ) + + rc = sweep_mod.run_sweep(yes=True, profile="coding-agent-loop") + assert rc == 0 + # If the fallback works, the gemma 24 GB bracket emits >1 cell. + # Read the persisted config to confirm at least one non-base cell + # was applied (the winner-apply step writes dflash.max_ctx). + final = (stub_env / "lucebox" / "config.toml").read_text() + assert "max_ctx" in final, "sweep should have written a winning max_ctx" + + +# ── pre-flight ───────────────────────────────────────────────────────────── + + +def test_preflight_refuses_when_no_systemd_unit( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config")) + monkeypatch.setenv("LUCEBOX_HOME", str(tmp_path / "lucebox")) + from rich.console import Console + + rc = sweep_mod._preflight(Console()) + assert rc == 2 # noqa: PLR2004 + + +def test_preflight_refuses_when_no_model_configured( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + monkeypatch.setenv("XDG_CONFIG_HOME", str(tmp_path / "config")) + monkeypatch.setenv("LUCEBOX_HOME", str(tmp_path / "lucebox")) + # Install the systemd unit so we get past that check. + unit_path = tmp_path / "config" / "systemd" / "user" / "lucebox.service" + unit_path.parent.mkdir(parents=True, exist_ok=True) + unit_path.write_text("# fake unit\n") + # No config.toml at all → live_config returns model.preset="" and target_file="" + from rich.console import Console + + # Force HostFacts so live_config doesn't try to read env. 
+ monkeypatch.setattr( + "lucebox.host_facts.from_env", + lambda: HostFacts(vram_gb=24), + ) + rc = sweep_mod._preflight(Console()) + assert rc == 2 # noqa: PLR2004 + + +# ── full sweep flow ──────────────────────────────────────────────────────── + + +def test_run_sweep_happy_path_picks_winner_and_applies( + stub_env: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Three candidates → 3 restarts → 3 snapshots → winner applied + restart.""" + # Force a 3-cell bracket so the test runs fast. + base = DflashRuntime(budget=22, max_ctx=65536) + candidates = [ + DflashRuntime(budget=8, max_ctx=65536), + base, + DflashRuntime(budget=32, max_ctx=65536), + ] + monkeypatch.setattr( + "lucebox.sweep.autotune_mod.candidate_configs", + lambda host, preset="": candidates, # noqa: ARG005 + ) + + # Restart always succeeds. + restart_calls = [] + profile_calls: list[tuple[Path, str]] = [] + + def fake_run(argv, check=False, env=None, **kw): # noqa: ARG001 + # The sweep's subprocess.run path now only carries the systemctl + # restarts — the legacy `lucebox profile` shell-out was replaced + # by a direct `run_profile()` call (mocked separately below) so + # cells write into the right per-cell sweep dir. + restart_calls.append(argv) + return mock.MagicMock(returncode=0) + + def fake_run_profile(cfg, *, level, console=None, out_dir=None, name=None, **kw): # noqa: ARG001 + assert out_dir is not None and name is not None, \ + "sweep must pass out_dir + name so cells land in the sweep tree" + # tps tied to call order → 16, 44, 64. Winner = budget=32 (tps=64). 
+ profile_calls.append((out_dir, name)) + idx = len(profile_calls) - 1 + tps = [16.0, 44.0, 64.0][idx] + _write_synthetic_snapshot(out_dir / name, tps) + return 0 + + monkeypatch.setattr("lucebox.sweep.subprocess.run", fake_run) + monkeypatch.setattr("lucebox.profile.run_profile", fake_run_profile) + monkeypatch.setattr("lucebox.sweep._wait_ready", lambda port, timeout_s: True) + + rc = sweep_mod.run_sweep(yes=True) + assert rc == 0 + + # 3 cell restarts + 1 final winner restart on subprocess.run; profile + # was called 3 times via the direct run_profile path. + restart_argvs = [c for c in restart_calls if c[0] == "systemctl"] + assert len(restart_argvs) == 4 # 3 cells + 1 final # noqa: PLR2004 + assert len(profile_calls) == 3 # noqa: PLR2004 + + # Winner = budget=32 (tps=64). It must be persisted as the final + # on-disk config. + from lucebox import config as config_mod + + entries = config_mod.config_get() + assert entries["dflash.budget"][0] == 32 # noqa: PLR2004 + assert entries["dflash.budget"][1] == "file" + + # Backup should be cleaned up on success. + backup = sweep_mod._backup_path() + assert not backup.exists(), f"backup not removed: {backup}" + + +def test_run_sweep_all_cells_fail_restores_backup( + stub_env: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """Every cell times out on wait_ready → backup restored, exit non-zero.""" + candidates = [ + DflashRuntime(budget=8), + DflashRuntime(budget=22), + DflashRuntime(budget=32), + ] + monkeypatch.setattr( + "lucebox.sweep.autotune_mod.candidate_configs", + lambda host, preset="": candidates, # noqa: ARG005 + ) + + # Capture the pre-sweep config.toml. + from lucebox import config as config_mod + + cfg_path = config_mod.default_config_path() + pre_text = cfg_path.read_text() + + monkeypatch.setattr( + "lucebox.sweep.subprocess.run", + lambda argv, **kw: mock.MagicMock(returncode=0), # noqa: ARG005 + ) + # Every readiness probe times out. 
+ monkeypatch.setattr("lucebox.sweep._wait_ready", lambda port, timeout_s: False) + + rc = sweep_mod.run_sweep(yes=True) + assert rc == 1 + + # Backup was restored — config.toml should match the pre-sweep state. + assert cfg_path.read_text() == pre_text + + +def test_run_sweep_keyboard_interrupt_restores_backup( + stub_env: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """KeyboardInterrupt mid-sweep → backup restored, exit 130. + + The cleanup-restart at the end of ``run_sweep`` also flows through + ``subprocess.run``; the fake here only raises mid-loop, so the + final cleanup call lands in the "happy" branch. + """ + candidates = [DflashRuntime(budget=8), DflashRuntime(budget=22), DflashRuntime(budget=32)] + monkeypatch.setattr( + "lucebox.sweep.autotune_mod.candidate_configs", + lambda host, preset="": candidates, # noqa: ARG005 + ) + + from lucebox import config as config_mod + + cfg_path = config_mod.default_config_path() + pre_text = cfg_path.read_text() + + state = {"calls": 0, "raised": False} + + def fake_run(argv, **kw): # noqa: ARG001 + state["calls"] += 1 + # Raise only the FIRST time we hit the loop-body restart (call + # #1 is the very first cell's restart). After we've raised + # once we fall through to the cleanup restart inside the + # finally / signal-handler path; that one must succeed so the + # restore completes cleanly. + if not state["raised"] and state["calls"] >= 2: # noqa: PLR2004 + state["raised"] = True + raise KeyboardInterrupt("simulated ctrl+c") + return mock.MagicMock(returncode=0) + + monkeypatch.setattr("lucebox.sweep.subprocess.run", fake_run) + monkeypatch.setattr("lucebox.sweep._wait_ready", lambda port, timeout_s: True) + # Don't actually install signal handlers — pytest already has its own. + monkeypatch.setattr("lucebox.sweep.signal.signal", lambda sig, h: signal.SIG_DFL) + + rc = sweep_mod.run_sweep(yes=True) + assert rc == 130 # noqa: PLR2004 + + # Backup was restored — config.toml matches pre-sweep. 
+ assert cfg_path.read_text() == pre_text diff --git a/pyproject.toml b/pyproject.toml index 56ae2bf4..86ff8393 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,8 @@ license = { text = "Apache-2.0" } authors = [{ name = "Lucebox" }] dependencies = [ + "harness", + "lucebox", "lucebox-dflash", "pflash", ] @@ -51,11 +53,13 @@ package = false no-build-isolation-package = ["qwen35-megakernel-bf16"] [tool.uv.workspace] -# Workspace members. Keeping the list to the packages that live in this -# repo lets `uv lock --check` / `uv sync --frozen` pass. -members = ["server", "optimizations/megakernel", "optimizations/pflash"] +# Workspace members. PR adds harness/ and lucebox/ packages alongside the +# existing server / megakernel / pflash members. +members = ["harness", "lucebox", "server", "optimizations/megakernel", "optimizations/pflash"] [tool.uv.sources] +harness = { workspace = true } +lucebox = { workspace = true } lucebox-dflash = { workspace = true } pflash = { workspace = true } qwen35-megakernel-bf16 = { workspace = true } diff --git a/scripts/check_lucebox_wrapper_sandbox.sh b/scripts/check_lucebox_wrapper_sandbox.sh new file mode 100755 index 00000000..df2b2b9b --- /dev/null +++ b/scripts/check_lucebox_wrapper_sandbox.sh @@ -0,0 +1,242 @@ +#!/usr/bin/env bash +# Exercise the host-side lucebox.sh installer/wrapper from an isolated prefix. +# +# The script intentionally runs from a throwaway HOME, XDG_CONFIG_HOME, +# LUCEBOX_HOME, model directory, and working directory. That catches accidental +# dependencies on the checkout or the user's real ~/.lucebox while keeping the +# test reproducible enough to paste into a bug report. 
+ +set -euo pipefail + +IMAGE="${LUCEBOX_TEST_IMAGE:-ghcr.io/easel/lucebox-hub}" +VARIANT="${LUCEBOX_TEST_VARIANT:-integration-props-uv-squared-clean-cuda12}" +WRAPPER_SOURCE="${LUCEBOX_TEST_WRAPPER_SOURCE:-local}" +RUN_PULL="${LUCEBOX_TEST_RUN_PULL:-1}" +RUN_CONTAINER_CLI="${LUCEBOX_TEST_RUN_CONTAINER_CLI:-1}" +KEEP_SANDBOX="${LUCEBOX_TEST_KEEP_SANDBOX:-0}" + +ROOT="" +LOG="" + +usage() { + cat <&2; usage >&2; exit 2 ;; + esac +done + +die() { + echo "[FAIL] $*" >&2 + if [ -n "$LOG" ] && [ -f "$LOG" ]; then + echo "[FAIL] transcript: $LOG" >&2 + fi + exit 1 +} + +note() { + printf '[INFO] %s\n' "$*" +} + +pass() { + printf '[PASS] %s\n' "$*" +} + +assert_file() { + [ -f "$1" ] || die "missing file: $1" + pass "file exists: $1" +} + +assert_contains() { + local file="$1" + local pattern="$2" + if ! grep -Fq "$pattern" "$file"; then + echo "----- $file -----" >&2 + sed -n '1,220p' "$file" >&2 || true + echo "-----------------" >&2 + die "expected '$pattern' in $file" + fi + pass "$file contains: $pattern" +} + +run_logged() { + note "run: $*" + { + printf '\n===== %s =====\n' "$*" + "$@" + printf '===== exit=0 =====\n' + } 2>&1 | tee -a "$LOG" +} + +run_logged_capture() { + local out="$1" + shift + note "run: $* > $out" + { + printf '\n===== %s > %s =====\n' "$*" "$out" + "$@" + local rc=$? + printf '===== exit=%s =====\n' "$rc" + return "$rc" + } 2>&1 | tee "$out" | tee -a "$LOG" >/dev/null +} + +cleanup() { + if [ -n "$ROOT" ] && [ "$KEEP_SANDBOX" != "1" ]; then + rm -rf "$ROOT" + elif [ -n "$ROOT" ]; then + note "kept sandbox: $ROOT" + note "transcript: $LOG" + fi +} +trap cleanup EXIT + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +ROOT="$(mktemp -d "${TMPDIR:-/tmp}/lucebox-wrapper-sandbox.XXXXXX")" +LOG="$ROOT/transcript.log" + +HOME_DIR="$ROOT/home" +BIN_DIR="$ROOT/bin" +XDG_DIR="$ROOT/xdg" +MODELS_DIR="$ROOT/models" +WORK_DIR="$ROOT/work" +mkdir -p "$HOME_DIR" "$BIN_DIR" "$XDG_DIR" "$MODELS_DIR" "$WORK_DIR" + +note "sandbox: $ROOT" +note "transcript: $LOG" + +case "$WRAPPER_SOURCE" in + local) + cp "$REPO_ROOT/lucebox.sh" "$BIN_DIR/lucebox" + ;; + http://*|https://*) + curl -fsSL "$WRAPPER_SOURCE" -o "$BIN_DIR/lucebox" + ;; + *) + cp "$WRAPPER_SOURCE" "$BIN_DIR/lucebox" + ;; +esac +chmod +x "$BIN_DIR/lucebox" + +FIRST_LINE="$(head -n 1 "$BIN_DIR/lucebox")" +[ "$FIRST_LINE" = "#!/usr/bin/env bash" ] || die "unexpected shebang: $FIRST_LINE" +pass "wrapper has expected shebang" + +export HOME="$HOME_DIR" +export XDG_CONFIG_HOME="$XDG_DIR" +export LUCEBOX_HOME="$HOME_DIR/.lucebox" +export LUCEBOX_MODELS="$MODELS_DIR" +export LUCEBOX_IMAGE="$IMAGE" +export LUCEBOX_VARIANT="$VARIANT" +export LUCEBOX_CONTAINER="lucebox-sandbox" +export LUCEBOX_PORT="18080" +export PATH="$BIN_DIR:$PATH" + +cd "$WORK_DIR" +[ "$PWD" = "$WORK_DIR" ] || die "failed to enter sandbox workdir" +pass "working directory isolated: $PWD" + +run_logged_capture "$ROOT/version.out" lucebox version +assert_contains "$ROOT/version.out" "0.2.0" + +run_logged_capture "$ROOT/help.out" lucebox help +assert_contains "$ROOT/help.out" "LUCEBOX_VARIANT" +assert_contains "$ROOT/help.out" "LUCEBOX_IMAGE" + +docker manifest inspect "${IMAGE}:${VARIANT}" >/dev/null +pass "image manifest exists: ${IMAGE}:${VARIANT}" + +if [ "$RUN_PULL" = "1" ]; then + run_logged_capture "$ROOT/pull.out" lucebox pull + assert_contains "$ROOT/pull.out" "${IMAGE}:${VARIANT}" +fi + +if [ "$RUN_CONTAINER_CLI" = "1" ]; then + run_logged_capture "$ROOT/check.out" lucebox check + # Sparse persistence: `config set` creates config.toml with only the + # named key. Replaces the old `configure --overwrite` path. 
+ run_logged_capture "$ROOT/config-image.out" lucebox config set "image=$IMAGE" + run_logged_capture "$ROOT/config-variant.out" lucebox config set "variant=$VARIANT" + assert_file "$LUCEBOX_HOME/config.toml" + [ "$(stat -c '%u' "$LUCEBOX_HOME/config.toml")" = "$(id -u)" ] \ + || die "config.toml is not owned by the invoking user" + pass "config.toml ownership matches invoking user" + assert_contains "$LUCEBOX_HOME/config.toml" "registry = \"$IMAGE\"" + assert_contains "$LUCEBOX_HOME/config.toml" "variant = \"$VARIANT\"" + + run_logged_capture "$ROOT/print-run.out" lucebox print-run + assert_contains "$ROOT/print-run.out" "${IMAGE}:${VARIANT}" + assert_contains "$ROOT/print-run.out" "$MODELS_DIR:/opt/lucebox-hub/dflash/models" + if grep -Fq "$REPO_ROOT" "$ROOT/print-run.out"; then + die "print-run leaked repository path: $REPO_ROOT" + fi + pass "print-run did not reference repository checkout" +fi + +# Exercise `lucebox install` without allowing it to call real systemctl, +# loginctl, docker, or nvidia-smi. The generated user unit must land under the +# sandbox XDG_CONFIG_HOME and point ExecStart at the sandbox-installed wrapper. 
+SHIM_DIR="$ROOT/shims" +mkdir -p "$SHIM_DIR" +cat > "$SHIM_DIR/docker" <<'EOF' +#!/usr/bin/env bash +case "${1:-}" in + info) exit 0 ;; + version) echo "25.0.0"; exit 0 ;; + stop) exit 0 ;; + *) echo "docker shim: $*" >&2; exit 0 ;; +esac +EOF +cat > "$SHIM_DIR/nvidia-smi" <<'EOF' +#!/usr/bin/env bash +case "$*" in + *"--query-gpu=name,memory.total,driver_version,compute_cap"*) + echo "Fake GPU, 24576, 555.42.01, 8.6"; exit 0 ;; + *"--query-gpu=name"*) + echo "Fake GPU"; exit 0 ;; + *) echo "Fake GPU"; exit 0 ;; +esac +EOF +cat > "$SHIM_DIR/systemctl" <<'EOF' +#!/usr/bin/env bash +if [ "$1" = "--user" ] && [ "$2" = "show-environment" ]; then exit 0; fi +if [ "$1" = "--user" ] && [ "$2" = "daemon-reload" ]; then exit 0; fi +echo "systemctl shim: $*" >&2 +exit 0 +EOF +cat > "$SHIM_DIR/loginctl" <<'EOF' +#!/usr/bin/env bash +echo "Linger=no" +EOF +chmod +x "$SHIM_DIR/docker" "$SHIM_DIR/nvidia-smi" "$SHIM_DIR/systemctl" "$SHIM_DIR/loginctl" + +PATH="$SHIM_DIR:$BIN_DIR:$PATH" run_logged_capture "$ROOT/install.out" lucebox install +UNIT="$XDG_CONFIG_HOME/systemd/user/lucebox.service" +assert_file "$UNIT" +assert_contains "$UNIT" "ExecStart=$BIN_DIR/lucebox serve" +assert_contains "$UNIT" "ExecStop=$SHIM_DIR/docker stop -t 30 lucebox-sandbox" +assert_contains "$ROOT/install.out" "Installed $UNIT" + +pass "sandbox wrapper check completed" +note "summary: image=${IMAGE}:${VARIANT} wrapper_source=${WRAPPER_SOURCE}" diff --git a/scripts/test_lucebox_sh.sh b/scripts/test_lucebox_sh.sh new file mode 100755 index 00000000..be75c178 --- /dev/null +++ b/scripts/test_lucebox_sh.sh @@ -0,0 +1,1131 @@ +#!/usr/bin/env bash +# scripts/test_lucebox_sh.sh — smoke tests for the host-side wrapper + +# every other bash script we ship. 
+#
+# Catches regressions like:
+#   * syntax errors (bash -n)
+#   * shellcheck error-level findings across every shipped bash script
+#   * `set -u` violations in command paths that don't need docker/nvidia —
+#     each subcommand dispatch is exercised in isolation to verify no
+#     LUCEBOX_HOST_* or DFLASH_* read fires before the helper that should
+#     populate it has run.
+#   * missing dispatch handlers (help, version, check, usage)
+#   * stale references to subcommands removed from main's case
+#
+# The wrapper is shell + has zero non-coreutils deps for the host-only
+# commands, so this script doesn't need docker/nvidia/systemd present —
+# probe_host degrades cleanly when those aren't found, and the
+# formatter must render fine for the "everything is missing" case too.
+#
+# Run from anywhere: scripts/test_lucebox_sh.sh
+
+set -euo pipefail
+
+# Resolve repo root + script under test.
+ROOT="$(git rev-parse --show-toplevel 2>/dev/null || (cd "$(dirname "$0")/.." && pwd))"
+SCRIPT="$ROOT/lucebox.sh"
+ENTRYPOINT="$ROOT/server/scripts/entrypoint.sh"
+INSTALLER="$ROOT/install.sh"
+
+if [ ! -f "$SCRIPT" ]; then
+    echo "FAIL: lucebox.sh not found at $SCRIPT" >&2
+    exit 1
+fi
+
+# entrypoint.sh ships with the docker-stack PR (#334). When it's absent
+# (e.g. on the lucebox-cli branch in isolation), skip the entire suite —
+# every section below either references $ENTRYPOINT in shellcheck targets,
+# parses it with `bash -n`, or sources/dispatches into it directly. The
+# host-only lucebox.sh wrapper itself is covered by lucebox.sh's own unit
+# tests; this script's value is the wrapper↔entrypoint contract.
+if [ ! -f "$ENTRYPOINT" ]; then
+    echo "Skipping entrypoint tests: server/scripts/entrypoint.sh not present (provided by #334 docker-stack)"
+    exit 0
+fi
+
+fail=0
+pass=0
+report() {
+    if [ "$1" = "ok" ]; then
+        printf ' \033[1;32m✓\033[0m %s\n' "$2"
+        pass=$((pass + 1))
+    else
+        printf ' \033[1;31m✗\033[0m %s\n' "$2"
+        if [ -n "${3:-}" ]; then
+            printf ' %s\n' "$3"
+        fi
+        fail=$((fail + 1))
+    fi
+}
+
+# Helper: run the wrapper with strict bash, capture stdout+stderr, check for
+# (a) zero exit code, (b) substring match. NO_COLOR is set so colour codes
+# don't pollute substring matches.
+assert_runs() {
+    local label="$1" cmd="$2" expect="${3:-}"
+    local out rc=0
+    # `|| rc=$?` exempts the capture from `set -e`: a failing command is reported, not fatal to the suite.
+    out=$(NO_COLOR=1 bash -c "$cmd" 2>&1) || rc=$?
+    if [ "$rc" -ne 0 ]; then
+        report fail "$label" "exit $rc; output: $(printf '%s' "$out" | head -3)"
+        return
+    fi
+    if [ -n "$expect" ] && ! grep -qF "$expect" <<<"$out"; then
+        report fail "$label" "missing expected substring '$expect'; got: $(printf '%s' "$out" | head -3)"
+        return
+    fi
+    report ok "$label"
+}
+
+# Helper: run a subcommand whose successful completion would normally need
+# docker / nvidia / systemd. We only care that the bash dispatch up to the
+# point of the missing dependency does NOT trip `set -u`. Exit code is
+# allowed to be non-zero; what we forbid is a raw "unbound variable" /
+# "syntax error" / "line N:" leak in the captured output.
+#
+# Wrapped in `timeout` so subcommands that exec into a follow-style binary
+# (logs → journalctl -f, status when systemd is healthy, etc.) don't hang
+# the test runner on a dev box where the underlying tools succeed.
+assert_no_set_u_leak() {
+    local label="$1"
+    shift
+    local out
+    out=$(NO_COLOR=1 timeout 5 bash "$@" 2>&1 || true)
+    # The "line N:" pattern is anchored to a script-path prefix to avoid
+    # false positives from journalctl output ("systemd[1385106]:") which
+    # contains a similar shape but isn't a bash error. Bash always emits
+    # the source filename before the line number, e.g.
+    #   /tmp/lbh-flat/lucebox.sh: line 200: VAR: unbound variable
+    if grep -qE 'unbound variable|syntax error|\.sh: line [0-9]+:' <<<"$out"; then
+        report fail "$label" "raw bash error leaked: $(head -3 <<<"$out")"
+    else
+        report ok "$label"
+    fi
+}
+
+echo "[test_lucebox_sh] running against $SCRIPT"
+
+# ── 1. shellcheck ─────────────────────────────────────────────────────────
+# Run shellcheck across every bash script we ship (the wrapper, the
+# in-container entrypoint, and every helper under scripts/). Error-level
+# findings fail the build; warnings are informational only — those have
+# been triaged and the SC2034/SC2155/SC2164 hits in sweep_ds4_2case.sh
+# aren't user-visible bugs.
+SHELLCHECK_TARGETS=(
+    "$SCRIPT"
+    "$ENTRYPOINT"
+    "$INSTALLER"
+)
+# Add every scripts/*.sh except this one (appended once below); `-ef` matches it by inode even via a relative path.
+while IFS= read -r -d '' f; do
+    [ "$f" -ef "${BASH_SOURCE[0]}" ] && continue
+    SHELLCHECK_TARGETS+=("$f")
+done < <(find "$ROOT/scripts" -maxdepth 1 -name '*.sh' -type f -print0 2>/dev/null)
+SHELLCHECK_TARGETS+=("${BASH_SOURCE[0]}")
+
+if command -v shellcheck >/dev/null 2>&1; then
+    sc_out=$(shellcheck --severity=error "${SHELLCHECK_TARGETS[@]}" 2>&1) || sc_rc=$?
+    sc_rc="${sc_rc:-0}"
+    if [ "$sc_rc" -eq 0 ]; then
+        report ok "shellcheck --severity=error (${#SHELLCHECK_TARGETS[@]} files)"
+    else
+        report fail "shellcheck --severity=error" "$(printf '%s' "$sc_out" | head -10)"
+    fi
+else
+    report fail "shellcheck not installed" "install via 'apt-get install -y shellcheck' (Ubuntu) or 'brew install shellcheck'"
+fi
+
+# ── 2. Syntax / parse ─────────────────────────────────────────────────────
+if bash -n "$SCRIPT"; then report ok "bash -n lucebox.sh parses cleanly"
+else report fail "bash -n lucebox.sh"; fi
+if bash -n "$ENTRYPOINT"; then report ok "bash -n entrypoint.sh parses cleanly"
+else report fail "bash -n entrypoint.sh"; fi
+
+# ── 3. 
Trivial subcommands (zero-exit expected) ─────────────────────────── +assert_runs "help" "bash '$SCRIPT' help" "host-side wrapper" +assert_runs "--help" "bash '$SCRIPT' --help" "host-side wrapper" +assert_runs "-h" "bash '$SCRIPT' -h" "host-side wrapper" +assert_runs "version" "bash '$SCRIPT' version" "" +assert_runs "--version" "bash '$SCRIPT' --version" "" + +# ── 4. check — host-only, must run to completion even without docker/nvidia. +# This is the path that broke last time (multi-byte glyph + set -u). +assert_runs "check" "bash '$SCRIPT' check" "host readiness report" + +# ── 5. systemd-surface subcommands — every one of these used to crash with +# `LUCEBOX_HOST_HAS_SYSTEMD: unbound variable` because cmd_systemctl_passthrough +# / cmd_logs / cmd_systemd_uninstall reached require_systemd without first +# calling probe_host. The fix routes through require_systemd → probe_host +# when the var is unset; these tests pin that invariant. +# +# On the bare runner there is no user systemd, no installed unit, and no +# docker — so every command is expected to exit non-zero with a CLEAN error +# message. What we forbid is a raw bash "unbound variable" leak. +for sub in start stop restart enable disable status install uninstall; do + assert_no_set_u_leak "$sub dispatch (no set -u leak)" "$SCRIPT" "$sub" +done +# `logs` is special: it execs `journalctl -f` which streams every historical +# journal record for the unit. On a dev box where the lucebox service has +# actually run, that stream contains every past error — including the very +# bugs this test exists to prevent — and we'd false-positive on them. Pass +# `-n 0 --no-pager` so we only see new entries (none, in the test window). +assert_no_set_u_leak "logs dispatch (no set -u leak)" "$SCRIPT" logs -n 0 --no-pager + +# ── 6. server-spawning subcommands — exercise the dispatch up to where +# the missing docker daemon stops them. 
`serve` is intentionally skipped +# because on a host with a working docker + the cuda12 image already +# pulled, it would actually exec into the container — at which point +# we'd be testing the image's entrypoint, not the wrapper. `pull` just +# execs `docker pull`, so we still smoke its host-side dispatch. +assert_no_set_u_leak "pull dispatch (no set -u leak)" "$SCRIPT" pull + +# ── 7. Unknown subcommand → cmd_in_container fallback path. Same rule: +# clean error, no raw bash leak. +assert_no_set_u_leak "unknown subcommand dispatch" "$SCRIPT" no-such-subcommand + +# ── 8. Pre-populated LUCEBOX_HOST_* env (simulates an already-probed host +# whose vars are passed in from a parent process). Useful in CI matrices +# where we want to mock a "good host" without nvidia-smi/docker on PATH. +out=$( + NO_COLOR=1 \ + LUCEBOX_HOST_HAS_SYSTEMD=0 \ + LUCEBOX_HOST_HAS_DOCKER=0 \ + LUCEBOX_HOST_HAS_CTK=none \ + LUCEBOX_HOST_GPU_VENDOR=none \ + LUCEBOX_HOST_GPU_NAME="" \ + LUCEBOX_HOST_GPU_COUNT=0 \ + LUCEBOX_HOST_VRAM_GB=0 \ + LUCEBOX_HOST_GPU_SM="" \ + LUCEBOX_HOST_DRIVER_VERSION="" \ + LUCEBOX_HOST_DRIVER_MAJOR=0 \ + LUCEBOX_HOST_NPROC=1 \ + LUCEBOX_HOST_RAM_GB=0 \ + LUCEBOX_HOST_IS_WSL=0 \ + LUCEBOX_HOST_DOCKER_VERSION="" \ + timeout 5 bash "$SCRIPT" start 2>&1 || true +) +if grep -qE 'unbound variable|syntax error' <<<"$out"; then + report fail "start with pre-populated LUCEBOX_HOST_* env" "leak: $(head -3 <<<"$out")" +else + report ok "start with pre-populated LUCEBOX_HOST_* env" +fi + +# ── 8b. PIN the top-of-script LUCEBOX_HOST_* safe-default seeds. Even with +# probe_host short-circuited to a no-op (the worst-case bug recurrence: a +# future refactor accidentally deletes the call from a dispatch path) the +# wrapper must not leak `unbound variable` on `start`. We achieve "probe_host +# is a no-op" by exporting `_LUCEBOX_HOST_PROBED=1` so ensure_probed skips +# the real probe — equivalent to a future refactor that calls ensure_probed +# but mis-implements the gate. 
+out=$( + NO_COLOR=1 \ + _LUCEBOX_HOST_PROBED=1 \ + timeout 5 bash "$SCRIPT" start 2>&1 || true +) +if grep -qE 'unbound variable|syntax error' <<<"$out"; then + report fail "start with probe_host bypassed (seed defaults must catch this)" "leak: $(head -3 <<<"$out")" +else + report ok "start with probe_host bypassed (seed defaults intact)" +fi + +# Same for every other systemd-surface subcommand, since the seed defaults +# are the only thing keeping these safe under `set -u` if probe_host is ever +# bypassed. +for sub in stop restart enable disable status install uninstall logs; do + out=$( + NO_COLOR=1 \ + _LUCEBOX_HOST_PROBED=1 \ + timeout 5 bash "$SCRIPT" "$sub" -n 0 --no-pager 2>&1 || true + ) + if grep -qE 'unbound variable|syntax error' <<<"$out"; then + report fail "$sub with probe_host bypassed" "leak: $(head -3 <<<"$out")" + else + report ok "$sub with probe_host bypassed" + fi +done + +# ── 8c. Install path writes a robust unit file. Use a sandbox HOME so we +# don't clobber the developer's real ~/.config/systemd/user/lucebox.service, +# and verify the generated unit contains the Environment= / ExecStartPre= +# hardening that Bug 2 ("systemctl start succeeds but no container") added. +# The install runs in a host with no real systemd (the sandbox doesn't have +# `systemctl --user`), so we pre-seed LUCEBOX_HOST_HAS_SYSTEMD=1 to slip past +# the require_systemd gate, then stub out the `systemctl` binary itself so +# daemon-reload is a no-op. +test_install_writes_robust_unit() { + local label="install writes hardened unit file" + local sandbox shim_dir + sandbox=$(mktemp -d) + shim_dir="$sandbox/bin" + mkdir -p "$shim_dir" + # Stub systemctl + docker + nvidia-smi + loginctl so the install's + # require_host_prereqs and daemon-reload calls all succeed. 
+ for binname in systemctl docker nvidia-smi loginctl; do + cat > "$shim_dir/$binname" <<'STUB' +#!/usr/bin/env bash +case "$1" in + ps|version) exit 0 ;; + show-user) echo "Linger=no" ;; + --query-gpu=*) echo "Fake, 24576, 550.00, 8.9" ;; +esac +exit 0 +STUB + chmod +x "$shim_dir/$binname" + done + local out rc unit_path + unit_path="$sandbox/.config/systemd/user/lucebox.service" + out=$( + set +e + HOME="$sandbox" \ + XDG_CONFIG_HOME="$sandbox/.config" \ + XDG_DATA_HOME="$sandbox/.local/share" \ + PATH="$shim_dir:$PATH" \ + LUCEBOX_HOST_HAS_SYSTEMD=1 \ + LUCEBOX_HOST_HAS_DOCKER=1 \ + LUCEBOX_HOST_HAS_CTK=runtime \ + LUCEBOX_HOST_GPU_VENDOR=nvidia \ + _LUCEBOX_HOST_PROBED=1 \ + NO_COLOR=1 \ + timeout 10 bash "$SCRIPT" install 2>&1 + echo "RC=$?" + ) + rc=$(grep -oE 'RC=[0-9]+$' <<<"$out" | tail -1 | sed 's/^RC=//') + rc="${rc:-99}" + if [ "$rc" != "0" ]; then + report fail "$label" "exit $rc; output: $(head -10 <<<"$out")" + rm -rf "$sandbox" + return + fi + if [ ! -f "$unit_path" ]; then + report fail "$label" "unit file not written at $unit_path" + rm -rf "$sandbox" + return + fi + # Required hardening — each line is a Bug-2 root-cause defence: + # ExecStartPre=…docker rm -f … → clear orphaned container name + # Environment=PATH=… → systemd user-session PATH is sparse + # Environment=LUCEBOX_IMAGE=… → pin the image the user installed against + local missing="" + for needle in \ + "ExecStartPre=" \ + "Environment=PATH=" \ + "Environment=LUCEBOX_IMAGE=" \ + "Environment=LUCEBOX_VARIANT=" \ + "Environment=LUCEBOX_PORT=" \ + "Environment=LUCEBOX_MODELS=" \ + ; do + grep -qF "$needle" "$unit_path" || missing="$missing $needle" + done + if [ -n "$missing" ]; then + report fail "$label" "unit missing required directives:$missing" + rm -rf "$sandbox" + return + fi + report ok "$label" + rm -rf "$sandbox" +} +test_install_writes_robust_unit + +# ── 9. 
entrypoint.sh dispatch — confirm the in-container dispatch routes +# trivial subcommands (shell, an unknown passthrough) without firing +# `set -u` on DFLASH_* / DRAFT_* vars that only get assigned on the +# serve path. We can't fully exec the serve path here (it needs nvidia +# and the compiled binary) but we can confirm the early dispatch is clean. +# +# Each `exec` would actually try to run the underlying binary, which we +# don't have — so we shim it by overriding `exec` via a wrapper script. +# Easier: just confirm `bash -n` parses and run a tiny subset. +out=$(NO_COLOR=1 SUBCMD=help bash -c " + cd '$ROOT' + # Simulate 'docker run ... lucebox-hub:cuda12 shell echo ok' — entrypoint + # gets SUBCMD=shell and execs /bin/bash with the rest of argv. We replace + # exec via PATH so we don't actually exec. + tmpdir=\$(mktemp -d) + trap 'rm -rf \$tmpdir' EXIT + cat > \$tmpdir/uv <<'STUB' +#!/usr/bin/env bash +echo \"uv stub: \$*\" +exit 0 +STUB + chmod +x \$tmpdir/uv + PATH=\$tmpdir:\$PATH bash $ENTRYPOINT shell -c 'echo entrypoint-shell-dispatched' +" 2>&1 || true) +if grep -qE 'unbound variable|syntax error' <<<"$out"; then + report fail "entrypoint shell dispatch (no set -u leak)" "leak: $(head -5 <<<"$out")" +else + report ok "entrypoint shell dispatch (no set -u leak)" +fi + +# ── 10. entrypoint.sh serve-path under `set -u` — drive the REAL +# server/scripts/entrypoint.sh through its full draft-resolution block by +# sandboxing it with a synthetic DFLASH_DIR layout and a `dflash_server` +# shim that captures argv instead of execing the native binary. The +# `DRAFT_FAMILY_GLOB: unbound variable` bug fired precisely here — the +# previous version of this test inlined the block instead of sourcing +# the real file, and silently passed even when the shipped script was +# broken. So this test invokes server/scripts/entrypoint.sh directly. 
+test_entrypoint_serve_path() { + local label="$1" target_name="$2" draft_file="$3" + local sandbox draft_dir models_dir bin_dir shim_dir + sandbox=$(mktemp -d) + models_dir="$sandbox/models" + draft_dir="$models_dir/draft" + bin_dir="$sandbox/build" + shim_dir="$sandbox/bin" + mkdir -p "$draft_dir" "$bin_dir" "$shim_dir" + # Synthetic target (must be a real file at least 5 GB to pass the + # auto-detect block, OR we set DFLASH_TARGET explicitly to skip it). + touch "$models_dir/$target_name" + touch "$draft_dir/$draft_file" + # `dflash_server` shim — print argv and exit 0 instead of running. + cat > "$bin_dir/dflash_server" <<'STUB' +#!/usr/bin/env bash +printf '[shim] dflash_server' +for a in "$@"; do printf ' %q' "$a"; done +printf '\n' +exit 0 +STUB + chmod +x "$bin_dir/dflash_server" + # `nvidia-smi` shim — pretend we have a 24 GB GPU so the autotune + # block runs but doesn't pick the under-12-GB warn tier. + cat > "$shim_dir/nvidia-smi" <<'STUB' +#!/usr/bin/env bash +case "$*" in + *"--query-gpu=memory.total"*) echo 24576 ;; + -L|*-L*) echo "GPU 0: Fake (UUID: 0)" ;; + *) echo "ok" ;; +esac +exit 0 +STUB + chmod +x "$shim_dir/nvidia-smi" + + local out rc + out=$( + set +e + PATH="$shim_dir:$PATH" \ + DFLASH_DIR="$sandbox" \ + DFLASH_SERVER_BIN="$bin_dir/dflash_server" \ + DFLASH_TARGET="$models_dir/$target_name" \ + DFLASH_DRAFT="$draft_dir" \ + timeout 10 bash "$ENTRYPOINT" serve 2>&1 + echo "RC=$?" + ) + rc=$(grep -oE 'RC=[0-9]+$' <<<"$out" | tail -1 | sed 's/^RC=//') + rc="${rc:-99}" + rm -rf "$sandbox" + if grep -qE 'unbound variable|syntax error' <<<"$out"; then + report fail "$label" "leak: $(head -5 <<<"$out")" + elif [ "$rc" != "0" ]; then + report fail "$label" "exit $rc; output: $(head -5 <<<"$out")" + elif ! 
grep -qF "[shim] dflash_server" <<<"$out"; then + report fail "$label" "shim never executed; output: $(head -5 <<<"$out")" + else + report ok "$label" + fi +} + +# Exercise three branches of the family-glob logic: qwen3.6 + gemma-4 (the +# two families with family-specific globs) and an unknown target that +# triggers the empty-FAMILY_GLOBS fallback to the generic glob list. +test_entrypoint_serve_path "entrypoint serve: qwen3.6 family match" \ + "Qwen3.6-27B-Q4_K_M.gguf" "dflash-draft-3.6-test.gguf" +test_entrypoint_serve_path "entrypoint serve: gemma-4-31b family match" \ + "gemma-4-31B-it-Q8_0.gguf" "gemma-4-31b-dflash-q8.gguf" +test_entrypoint_serve_path "entrypoint serve: generic fallback" \ + "Mystery-Model-7B.gguf" "model.gguf" + +# ── 11. entrypoint.sh serve-path with MULTIPLE target-sized GGUFs in +# models/. The single-candidate fixture in test 10 doesn't exercise the +# auto-detect path that picks "first alphabetically" when more than one +# target ≥5 GB lives in the models dir — that path is what the sindri +# decode sweep tripped over after the user added the qwen3.6-moe preset +# (commit 4b6bced) alongside the existing Qwen3.6-27B target. The crash +# manifested as `DRAFT_FAMILY_GLOB: unbound variable`, and the partial +# fix in a87bb93 didn't survive a recurrence. +# +# Uses sparse files (`truncate -s 6G`) so the test stays cheap on disk — +# the 6 GB virtual size is enough to clear the find ... -size +5G filter +# without consuming actual blocks. Skip if truncate is missing (e.g. +# minimal busybox CI image). +test_entrypoint_multi_target() { + local label="$1" + shift + if ! 
command -v truncate &>/dev/null; then + report ok "$label (skipped: truncate not available)" + return + fi + local sandbox draft_dir models_dir bin_dir shim_dir + sandbox=$(mktemp -d) + models_dir="$sandbox/models" + draft_dir="$models_dir/draft" + bin_dir="$sandbox/build" + shim_dir="$sandbox/bin" + mkdir -p "$draft_dir" "$bin_dir" "$shim_dir" + # Two qwen3.6-shaped targets ≥5 GB each — exactly the layout that + # broke on sindri (Qwen3.6-27B + Qwen3.6-35B-A3B-UD-Q4_K_M). + truncate -s 6G "$models_dir/Qwen3.6-27B-Q4_K_M.gguf" + truncate -s 6G "$models_dir/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf" + touch "$draft_dir/dflash-draft-3.6-test.gguf" + cat > "$bin_dir/dflash_server" <<'STUB' +#!/usr/bin/env bash +printf '[shim] dflash_server' +for a in "$@"; do printf ' %q' "$a"; done +printf '\n' +exit 0 +STUB + chmod +x "$bin_dir/dflash_server" + cat > "$shim_dir/nvidia-smi" <<'STUB' +#!/usr/bin/env bash +case "$*" in + *"--query-gpu=memory.total"*) echo 24576 ;; + -L|*-L*) echo "GPU 0: Fake (UUID: 0)" ;; + *) echo "ok" ;; +esac +exit 0 +STUB + chmod +x "$shim_dir/nvidia-smi" + + local out rc + out=$( + set +e + # NOTE: deliberately NOT setting DFLASH_TARGET — the test must + # exercise the auto-detect block (line ~151). The explicit-config + # workaround from the bug report would skip the bug entirely. + PATH="$shim_dir:$PATH" \ + DFLASH_DIR="$sandbox" \ + DFLASH_SERVER_BIN="$bin_dir/dflash_server" \ + DFLASH_DRAFT="$draft_dir" \ + timeout 10 bash "$ENTRYPOINT" serve 2>&1 + echo "RC=$?" + ) + rc=$(grep -oE 'RC=[0-9]+$' <<<"$out" | tail -1 | sed 's/^RC=//') + rc="${rc:-99}" + rm -rf "$sandbox" + if grep -qE 'unbound variable|syntax error' <<<"$out"; then + report fail "$label" "leak: $(grep -E 'unbound variable|syntax error' <<<"$out" | head -3)" + elif [ "$rc" != "0" ]; then + report fail "$label" "exit $rc; output: $(head -5 <<<"$out")" + elif ! grep -qF "[shim] dflash_server" <<<"$out"; then + report fail "$label" "shim never executed; output: $(head -10 <<<"$out")" + elif ! 
grep -qF "Multiple candidate targets" <<<"$out"; then + report fail "$label" "multi-target warn missing — did the auto-detect block fire?" + else + report ok "$label" + fi +} + +# Drive the regression: the sindri layout that broke (post-moe-preset). +test_entrypoint_multi_target "entrypoint serve: multi-target auto-detect (no DRAFT_FAMILY_GLOB leak)" + +# Also drive the DFLASH_DRAFT-is-a-file path. The init at entrypoint.sh:257 +# sits inside `if [ -d "$DFLASH_DRAFT" ]; then` — when DRAFT is a file the +# block is skipped, and any future read of DRAFT_FAMILY_GLOB outside the +# block would trip set -u. The defensive `:-` guard at the read site is +# meant to survive that refactor; this test guarantees it. +test_entrypoint_draft_is_file() { + local label="$1" + local sandbox draft_dir models_dir bin_dir shim_dir + sandbox=$(mktemp -d) + models_dir="$sandbox/models" + draft_dir="$models_dir/draft" + bin_dir="$sandbox/build" + shim_dir="$sandbox/bin" + mkdir -p "$draft_dir" "$bin_dir" "$shim_dir" + touch "$models_dir/Qwen3.6-27B-Q4_K_M.gguf" + # DFLASH_DRAFT points at a FILE (not a directory). + touch "$draft_dir/dflash-draft-3.6-test.gguf" + cat > "$bin_dir/dflash_server" <<'STUB' +#!/usr/bin/env bash +printf '[shim] dflash_server' +for a in "$@"; do printf ' %q' "$a"; done +printf '\n' +exit 0 +STUB + chmod +x "$bin_dir/dflash_server" + cat > "$shim_dir/nvidia-smi" <<'STUB' +#!/usr/bin/env bash +case "$*" in + *"--query-gpu=memory.total"*) echo 24576 ;; + -L|*-L*) echo "GPU 0: Fake (UUID: 0)" ;; + *) echo "ok" ;; +esac +exit 0 +STUB + chmod +x "$shim_dir/nvidia-smi" + + local out rc + out=$( + set +e + PATH="$shim_dir:$PATH" \ + DFLASH_DIR="$sandbox" \ + DFLASH_SERVER_BIN="$bin_dir/dflash_server" \ + DFLASH_TARGET="$models_dir/Qwen3.6-27B-Q4_K_M.gguf" \ + DFLASH_DRAFT="$draft_dir/dflash-draft-3.6-test.gguf" \ + timeout 10 bash "$ENTRYPOINT" serve 2>&1 + echo "RC=$?" 
+ ) + rc=$(grep -oE 'RC=[0-9]+$' <<<"$out" | tail -1 | sed 's/^RC=//') + rc="${rc:-99}" + rm -rf "$sandbox" + if grep -qE 'unbound variable|syntax error' <<<"$out"; then + report fail "$label" "leak: $(grep -E 'unbound variable|syntax error' <<<"$out" | head -3)" + elif [ "$rc" != "0" ]; then + report fail "$label" "exit $rc; output: $(head -5 <<<"$out")" + else + report ok "$label" + fi +} +test_entrypoint_draft_is_file "entrypoint serve: DFLASH_DRAFT is a file (no DRAFT_FAMILY_GLOB leak)" + +# ── 12. entrypoint.sh writes HOST_INFO atomically on the serve path. The +# C++ server reads /opt/lucebox-hub/HOST_INFO into ServerConfig.host_info +# and surfaces it under /props.host. We can't write to /opt/lucebox-hub +# from the test runner, so override the path by sourcing the helpers and +# calling _build_host_info_json directly. The full entrypoint runs in +# test 10/11 already; this test pins the JSON shape independently. +test_entrypoint_host_info_json() { + local label="$1" + # Source the helper functions from the real entrypoint.sh. 
+ # shellcheck disable=SC1090 + source <(awk '/^_json_escape\(\) \{/,/^\}/' "$ENTRYPOINT") + # shellcheck disable=SC1090 + source <(awk '/^_json_str_or_null\(\) \{/,/^\}/' "$ENTRYPOINT") + # shellcheck disable=SC1090 + source <(awk '/^_json_int_or_null\(\) \{/,/^\}/' "$ENTRYPOINT") + # shellcheck disable=SC1090 + source <(awk '/^_emit_gpu_array\(\) \{/,/^\}/' "$ENTRYPOINT") + # shellcheck disable=SC1090 + source <(awk '/^_build_host_info_json\(\) \{/,/^\}/' "$ENTRYPOINT") + + local out + LUCEBOX_HOST_OS_PRETTY="Ubuntu 22.04.3 LTS" \ + LUCEBOX_HOST_KERNEL="6.6.87.2-microsoft-standard-WSL2" \ + LUCEBOX_HOST_WSL_VERSION="wsl2" \ + LUCEBOX_HOST_DOCKER_VERSION="29.1.3" \ + LUCEBOX_HOST_DRIVER_VERSION="596.36" \ + LUCEBOX_HOST_NVIDIA_CTK_VERSION="1.16.2" \ + LUCEBOX_HOST_CPU_MODEL='Intel(R) Core(TM) Ultra 9 275HX' \ + LUCEBOX_HOST_NPROC=24 \ + LUCEBOX_HOST_RAM_GB=64 \ + LUCEBOX_HOST_GPU_LIST_CSV="0, GPU-abc, 00000000:01:00.0, NVIDIA RTX 5090, 12.0, 24576 MiB, 175.00 W" \ + LUCEBOX_HOST_CUDA_VISIBLE_DEVICES="0" \ + out=$(_build_host_info_json "lucebox.sh" "lucebox.sh" "2026-05-28T20:31:42Z") + if ! python3 -c "import json,sys; d=json.loads(sys.argv[1]); assert d['os_pretty']=='Ubuntu 22.04.3 LTS'; assert d['wsl_version']=='wsl2'; assert d['nvidia_ctk_version']=='1.16.2'; assert d['source']=='lucebox.sh'; assert d['gpus'][0]['vram_gb']==24; assert d['gpus'][0]['name']=='NVIDIA RTX 5090'" "$out" >/dev/null 2>&1; then + report fail "$label (populated)" "JSON shape mismatch: $out" + return + fi + # Now drive the unknown path: every LUCEBOX_HOST_* unset → nulls and source=unknown. + out=$(env -i bash -c " + set -u + $(declare -f _json_escape _json_str_or_null _json_int_or_null _emit_gpu_array _build_host_info_json) + _build_host_info_json 'unknown' 'entrypoint.sh' '2026-05-28T20:31:42Z' + ") + if ! 
python3 -c "import json,sys; d=json.loads(sys.argv[1]); assert d['source']=='unknown'; assert d['gpus']==[]; assert d['os_pretty'] is None" "$out" >/dev/null 2>&1; then + report fail "$label (unknown)" "JSON shape mismatch: $out" + return + fi + report ok "$label" +} +test_entrypoint_host_info_json "entrypoint HOST_INFO JSON shape (populated + unknown)" + +# ── install.sh end-to-end ───────────────────────────────────────────────── +# Drive install.sh against a file:// URL pointing at a fixture lucebox.sh, +# verify the installed copy has LUCEBOX_INSTALLED_FROM rewritten to the +# fetched URL — that's the contract that `lucebox update` depends on to +# preserve the user's channel across upgrades. +test_install_sh_bakes_source_url() { + local label="$1" + local tmp dest_dir dest_path src_url out rc + tmp=$(mktemp -d -t lucebox-install.XXXXXX) + # Use the real lucebox.sh as the "remote" file — `file://` works with + # curl out of the box and exercises the same install.sh code path as + # an https fetch would. + src_url="file://$SCRIPT" + dest_dir="$tmp/bin" + dest_path="$dest_dir/lucebox" + out=$(LUCEBOX_INSTALL_URL="$src_url" LUCEBOX_INSTALL_DEST="$dest_path" \ + NO_COLOR=1 bash "$INSTALLER" 2>&1) || rc=$? + rc="${rc:-0}" + if [ "$rc" -ne 0 ]; then + rm -rf "$tmp" + report fail "$label" "installer exited $rc; output: $(printf '%s' "$out" | head -3)" + return + fi + if [ ! -x "$dest_path" ]; then + rm -rf "$tmp" + report fail "$label" "installed file missing or not executable at $dest_path" + return + fi + if ! 
grep -q "^LUCEBOX_INSTALLED_FROM=\"$src_url\"$" "$dest_path"; then + rm -rf "$tmp" + report fail "$label" "LUCEBOX_INSTALLED_FROM not rewritten in installed copy" + return + fi + rm -rf "$tmp" + report ok "$label" +} +test_install_sh_bakes_source_url "install.sh bakes LUCEBOX_INSTALLED_FROM into installed copy" + +# ── update dispatch ─────────────────────────────────────────────────────── +# `lucebox update` must dispatch to cmd_update — verify it's wired in the +# main case statement and appears in --help. We can't actually run the +# update (it'd curl + replace this very script) so the test is parse-level. +test_update_subcommand_wired() { + local label="$1" + local out + out=$(LUCEBOX_HOST_HAS_SYSTEMD=0 "$SCRIPT" --help 2>&1) + if ! grep -q '^ update ' <<<"$out"; then + report fail "$label" "update command missing from --help output" + return + fi + if ! grep -q '^[[:space:]]*update)[[:space:]]*cmd_update' "$SCRIPT"; then + report fail "$label" "update) → cmd_update dispatch not wired" + return + fi + report ok "$label" +} +test_update_subcommand_wired "lucebox update subcommand is wired" + +# ── IMAGE_BASE derived from install source ──────────────────────────────── +# Source lucebox.sh in a subshell with LUCEBOX_INSTALLED_FROM pointing at +# various URLs, then check that IMAGE_BASE comes out right. Uses +# `set -e; return` early so we don't actually run the wrapper's main(). 
+test_image_base_derives_from_install_url() { + local label="$1" url expected got + for case in \ + "https://raw.githubusercontent.com/easel/lucebox-hub/feat/lucebox-docker/lucebox.sh|ghcr.io/easel/lucebox-hub" \ + "https://raw.githubusercontent.com/Luce-Org/lucebox-hub/main/lucebox.sh|ghcr.io/luce-org/lucebox-hub" \ + "https://raw.githubusercontent.com/easel/lucebox-hub/601ab52/lucebox.sh|ghcr.io/easel/lucebox-hub" \ + "https://example.com/bogus|ghcr.io/luce-org/lucebox-hub" + do + url="${case%%|*}" + expected="${case##*|}" + # Extract the derivation function from the script and run it in + # isolation — sourcing the whole script triggers main() and side + # effects we don't want under a test harness. + got=$(bash -c ' + '"$(sed -n "/^_lucebox_derive_image()/,/^}/p" "$SCRIPT")"' + _lucebox_derive_image "$1" + ' bash "$url") + if [ "$got" != "$expected" ]; then + report fail "$label" "url=$url expected=$expected got=$got" + return + fi + done + report ok "$label" +} +test_image_base_derives_from_install_url "IMAGE_BASE derived from LUCEBOX_INSTALLED_FROM (4 URL shapes)" + +# ── config.toml reader + resolver ───────────────────────────────────────── +# Drive _lucebox_config_get + _lucebox_resolve against a fixture +# config.toml in a tmp $LUCEBOX_HOME. Verifies the wrapper agrees with +# the Python CLI on every scalar that lives in [image]/[runtime]/[paths]. +test_config_toml_reader_and_resolve() { + local label="$1" tmp got + tmp=$(mktemp -d -t lucebox-cfg.XXXXXX) + cat > "$tmp/config.toml" <<'TOML' +[image] +variant = "cuda13" +registry = "ghcr.io/myorg/forkedhub" + +[runtime] +port = 9090 +container_name = "luce-test" + +[paths] +models = "/srv/models" + +[dflash] +budget = 22 +lazy = false +TOML + + # Exercise both helpers + the resolver via a subshell that sources + # the relevant snippets out of lucebox.sh. 
Each case is a triple: + # env_value | toml_key | default | expected + local cases=( + "|image.registry|ghcr.io/luce-org/lucebox-hub|ghcr.io/myorg/forkedhub" + "|image.variant|cuda12|cuda13" + "|runtime.port|8080|9090" + "|runtime.container_name|lucebox|luce-test" + "|paths.models|/var/lib/lucebox|/srv/models" + "OVERRIDE|image.registry|ghcr.io/luce-org/lucebox-hub|OVERRIDE" + "|missing.key|fallback-default|fallback-default" + ) + local case env_value toml_key default expected + for case in "${cases[@]}"; do + IFS='|' read -r env_value toml_key default expected <<<"$case" + got=$(LUCEBOX_HOME="$tmp" bash -c ' + '"$(sed -n "/^_lucebox_config_path()/,/^}/p" "$SCRIPT")"' + '"$(sed -n "/^_lucebox_config_get()/,/^}/p" "$SCRIPT")"' + '"$(sed -n "/^_lucebox_resolve()/,/^}/p" "$SCRIPT")"' + _lucebox_resolve "$1" "$2" "$3" + ' bash "$env_value" "$toml_key" "$default") + if [ "$got" != "$expected" ]; then + rm -rf "$tmp" + report fail "$label" "env=$env_value key=$toml_key default=$default expected=$expected got=$got" + return + fi + done + rm -rf "$tmp" + report ok "$label" +} +test_config_toml_reader_and_resolve "config.toml reader + env > toml > default resolution (7 cases)" + +# ── cmd_serve under systemd: INVOCATION_ID short-circuits is-active ────── +# When systemd invokes the wrapper as a unit's ExecStart, it sets +# $INVOCATION_ID. The wrapper must NOT then refuse "already running under +# systemd" — that's a self-defeating check that turns into a restart loop. +# Verify the guard is present in the source (the actual behavior requires +# a running systemd unit to test end-to-end, which the harness can't do). +test_cmd_serve_invocation_id_guard() { + local label="$1" + if ! grep -q 'INVOCATION_ID' "$SCRIPT"; then + report fail "$label" "INVOCATION_ID guard missing from cmd_serve preflight" + return + fi + # The guard must be the AND-condition gating the is-active check. 
+ # If grep finds the is-active line WITHOUT INVOCATION_ID nearby, + # the guard isn't wired correctly. + if ! awk ' + /INVOCATION_ID/ { saw_guard = NR } + /is-active --quiet "\$UNIT_NAME"/ { + if (saw_guard && NR - saw_guard <= 3) found = 1 + } + END { exit (found ? 0 : 1) } + ' "$SCRIPT"; then + report fail "$label" "INVOCATION_ID not adjacent to is-active check (guard not wired)" + return + fi + report ok "$label" +} +test_cmd_serve_invocation_id_guard "cmd_serve has INVOCATION_ID guard on systemd is-active check" + +# ── cmd_systemctl_passthrough: smart start ─────────────────────────────── +# Verify the source has the "already active" + "restart loop" short +# circuits for the start action. Behavior-level testing requires a real +# unit; this is a source-level guarantee that the branches exist. +test_cmd_start_already_active_shortcircuit() { + local label="$1" + if ! grep -q 'is already active' "$SCRIPT"; then + report fail "$label" "already-active short-circuit missing" + return + fi + if ! grep -q 'is in restart-loop' "$SCRIPT"; then + report fail "$label" "restart-loop short-circuit missing" + return + fi + report ok "$label" +} +test_cmd_start_already_active_shortcircuit "lucebox start has already-active + restart-loop short-circuits" + +# ── install.sh SHA-pin refusal + CHANNEL override ──────────────────────── +# A SHA-pinned LUCEBOX_INSTALL_URL with no LUCEBOX_INSTALL_CHANNEL must +# refuse — otherwise `lucebox update` would re-fetch that frozen SHA +# forever. With CHANNEL set, the bake-in uses the channel URL, not the +# fetch URL. +test_install_sha_pin_refusal_and_channel_override() { + local label="$1" tmp got rc + tmp=$(mktemp -d -t lucebox-sha.XXXXXX) + + # Case 1: SHA-pinned URL without CHANNEL → must refuse + LUCEBOX_INSTALL_URL="https://raw.githubusercontent.com/easel/lucebox-hub/abc1234567/lucebox.sh" \ + LUCEBOX_INSTALL_DEST="$tmp/lucebox1" \ + NO_COLOR=1 \ + bash "$INSTALLER" >/dev/null 2>&1 && rc=0 || rc=$? 
+ if [ "$rc" -eq 0 ]; then + rm -rf "$tmp" + report fail "$label" "SHA-pinned URL without CHANNEL should have refused (rc=$rc, got success)" + return + fi + if [ -f "$tmp/lucebox1" ]; then + rm -rf "$tmp" + report fail "$label" "SHA-pinned URL refusal still wrote $tmp/lucebox1" + return + fi + + # Case 2: SHA-pinned URL WITH CHANNEL → installs, bakes CHANNEL + LUCEBOX_INSTALL_URL="file://$SCRIPT" \ + LUCEBOX_INSTALL_CHANNEL="https://raw.githubusercontent.com/easel/lucebox-hub/feat/lucebox-docker/lucebox.sh" \ + LUCEBOX_INSTALL_DEST="$tmp/lucebox2" \ + NO_COLOR=1 \ + bash "$INSTALLER" >/dev/null 2>&1 || rc=$? + got=$(grep '^LUCEBOX_INSTALLED_FROM=' "$tmp/lucebox2" 2>/dev/null || echo missing) + if [ "$got" != 'LUCEBOX_INSTALLED_FROM="https://raw.githubusercontent.com/easel/lucebox-hub/feat/lucebox-docker/lucebox.sh"' ]; then + rm -rf "$tmp" + report fail "$label" "CHANNEL not baked; got: $got" + return + fi + + rm -rf "$tmp" + report ok "$label" +} +test_install_sha_pin_refusal_and_channel_override "install.sh refuses SHA-pin without CHANNEL + honors CHANNEL override" + +# ── lucebox completion ─────────────────────────────────────────────────── +# The completion script must source cleanly and complete a known prefix. +test_completion_bash() { + local label="$1" out + out=$(LUCEBOX_HOST_HAS_SYSTEMD=0 bash -c ' + source <("$1" completion bash 2>/dev/null) + COMP_WORDS=(lucebox conf) + COMP_CWORD=1 + _lucebox_complete + printf "%s\n" "${COMPREPLY[@]}" + ' bash "$SCRIPT") + if ! 
grep -qx 'config' <<<"$out"; then + report fail "$label" "completion didn't suggest 'config' for prefix 'conf'; got: $(printf '%s' "$out" | tr '\n' ' ')" + return + fi + report ok "$label" +} +test_completion_bash "lucebox completion bash completes a known prefix" + +# ── docker exec routing ─────────────────────────────────────────────────── +# When the lucebox container is running, steady-state subcommands must +# `docker exec` into it (cheap + shares the live server's net namespace) and +# service-restarting subcommands (autotune --sweep, serve, ...) must stay on +# `docker run`. We mock docker via a PATH shim that: +# - on `docker ps -q -f name=^lucebox$` prints a fake container id +# (signals "container is running") iff DOCKER_FAKE_RUNNING=1. +# - on any other call (run, exec, pull, ...) echoes its argv on stdout and +# exits 0. The test then asserts on the captured first-token (run vs exec) +# and trailing argv. +# +# nvidia-smi is stubbed too so probe_host doesn't barf, but the captured argv +# we care about is the docker invocation downstream of dispatch. +_make_docker_shim() { + local sandbox="$1" running="$2" + local shim_dir="$sandbox/bin" + mkdir -p "$shim_dir" + # docker shim: dispatch on first arg. Important: ps -q -f name=^lucebox$ + # must print a fake id when DOCKER_FAKE_RUNNING=1 and nothing otherwise. + # All other invocations (run, exec, pull) print "DOCKER_INVOKED " + # on stdout so the caller can grep it. + cat > "$shim_dir/docker" < "$shim_dir/nvidia-smi" <<'STUB' +#!/usr/bin/env bash +case "$*" in + *"--query-gpu="*) echo "Fake GPU, 24576, 550.00, 8.9" ;; + *) echo "ok" ;; +esac +exit 0 +STUB + chmod +x "$shim_dir/nvidia-smi" +} + +# Drive the wrapper through the dispatch case under test and capture the +# docker invocation it would have exec'd. 
Because `cmd_in_container` / +# `cmd_exec_in_container` call `exec docker ...` we replace `exec` semantics +# by running the wrapper in a subshell — the docker shim prints what it was +# called with and the captured stdout is the proof. +_run_wrapper_capture_docker() { + local sandbox="$1"; shift + local shim_dir="$sandbox/bin" + set +e + HOME="$sandbox" \ + XDG_CONFIG_HOME="$sandbox/.config" \ + XDG_DATA_HOME="$sandbox/.local/share" \ + LUCEBOX_HOME="$sandbox/.lucebox" \ + PATH="$shim_dir:$PATH" \ + LUCEBOX_HOST_HAS_DOCKER=1 \ + LUCEBOX_HOST_HAS_CTK=runtime \ + LUCEBOX_HOST_GPU_VENDOR=nvidia \ + LUCEBOX_HOST_DRIVER_MAJOR=550 \ + LUCEBOX_HOST_DRIVER_VERSION="550.00" \ + LUCEBOX_HOST_GPU_NAME="Fake GPU" \ + LUCEBOX_HOST_GPU_COUNT=1 \ + LUCEBOX_HOST_VRAM_GB=24 \ + LUCEBOX_HOST_GPU_SM="89" \ + LUCEBOX_HOST_NPROC=8 \ + LUCEBOX_HOST_RAM_GB=64 \ + LUCEBOX_HOST_HAS_SYSTEMD=0 \ + LUCEBOX_HOST_IS_WSL=0 \ + LUCEBOX_HOST_DOCKER_VERSION="29.1.3" \ + _LUCEBOX_HOST_PROBED=1 \ + NO_COLOR=1 \ + timeout 10 bash "$SCRIPT" "$@" 2>&1 + set -e +} + +test_routes_to_exec_when_running() { + local label="$1" sandbox out + sandbox=$(mktemp -d -t lucebox-route.XXXXXX) + _make_docker_shim "$sandbox" 1 + out=$(_run_wrapper_capture_docker "$sandbox" config get model.preset || true) + rm -rf "$sandbox" + if ! grep -q '^DOCKER_INVOKED exec' <<<"$out"; then + report fail "$label" "expected 'docker exec' invocation; got: $(head -3 <<<"$out")" + return + fi + if grep -q '^DOCKER_INVOKED run' <<<"$out"; then + report fail "$label" "got 'docker run' when container is up — should have exec'd" + return + fi + # Sanity: the exec line ends with `lucebox config get model.preset`. + if ! 
grep -qE 'lucebox config get model.preset' <<<"$out"; then + report fail "$label" "exec argv missing tail 'lucebox config get model.preset'; got: $(head -3 <<<"$out")" + return + fi + report ok "$label" +} +test_routes_to_exec_when_running "config get routes to docker exec when container running" + +test_routes_to_run_when_not_running() { + local label="$1" sandbox out + sandbox=$(mktemp -d -t lucebox-route.XXXXXX) + _make_docker_shim "$sandbox" 0 + out=$(_run_wrapper_capture_docker "$sandbox" config get model.preset || true) + rm -rf "$sandbox" + if ! grep -q '^DOCKER_INVOKED run' <<<"$out"; then + report fail "$label" "expected 'docker run' invocation (container not running); got: $(head -3 <<<"$out")" + return + fi + if grep -q '^DOCKER_INVOKED exec' <<<"$out"; then + report fail "$label" "got 'docker exec' but container is not running — should fall back to run" + return + fi + report ok "$label" +} +test_routes_to_run_when_not_running "config get falls back to docker run when container not running" + +test_sweep_stays_on_run_even_when_running() { + local label="$1" sandbox out + sandbox=$(mktemp -d -t lucebox-route.XXXXXX) + _make_docker_shim "$sandbox" 1 + out=$(_run_wrapper_capture_docker "$sandbox" autotune --sweep || true) + rm -rf "$sandbox" + if grep -q '^DOCKER_INVOKED exec' <<<"$out"; then + report fail "$label" "autotune --sweep used docker exec — would restart the container it's in" + return + fi + if ! grep -q '^DOCKER_INVOKED run' <<<"$out"; then + report fail "$label" "expected 'docker run' for sweep; got: $(head -3 <<<"$out")" + return + fi + report ok "$label" +} +test_sweep_stays_on_run_even_when_running "autotune --sweep stays on docker run even when container is up" + +test_autotune_no_sweep_uses_exec() { + local label="$1" sandbox out + sandbox=$(mktemp -d -t lucebox-route.XXXXXX) + _make_docker_shim "$sandbox" 1 + out=$(_run_wrapper_capture_docker "$sandbox" autotune --list-profiles || true) + rm -rf "$sandbox" + if ! 
grep -q '^DOCKER_INVOKED exec' <<<"$out"; then + report fail "$label" "expected 'docker exec' for autotune --list-profiles; got: $(head -3 <<<"$out")" + return + fi + report ok "$label" +} +test_autotune_no_sweep_uses_exec "autotune --list-profiles routes to docker exec when container running" + +test_no_exec_flag_forces_run() { + local label="$1" sandbox out + sandbox=$(mktemp -d -t lucebox-route.XXXXXX) + _make_docker_shim "$sandbox" 1 + # --no-exec must override the prefer-exec path even when container is up. + out=$(_run_wrapper_capture_docker "$sandbox" --no-exec config get model.preset || true) + rm -rf "$sandbox" + if grep -q '^DOCKER_INVOKED exec' <<<"$out"; then + report fail "$label" "--no-exec failed to force run path; got exec" + return + fi + if ! grep -q '^DOCKER_INVOKED run' <<<"$out"; then + report fail "$label" "expected 'docker run' under --no-exec; got: $(head -3 <<<"$out")" + return + fi + report ok "$label" +} +test_no_exec_flag_forces_run "--no-exec flag forces docker run even when container is up" + +test_no_exec_env_forces_run() { + local label="$1" sandbox out + sandbox=$(mktemp -d -t lucebox-route.XXXXXX) + _make_docker_shim "$sandbox" 1 + out=$( + LUCEBOX_NO_EXEC=1 _run_wrapper_capture_docker "$sandbox" config get model.preset || true + ) + rm -rf "$sandbox" + if grep -q '^DOCKER_INVOKED exec' <<<"$out"; then + report fail "$label" "LUCEBOX_NO_EXEC=1 failed to force run path; got exec" + return + fi + if ! grep -q '^DOCKER_INVOKED run' <<<"$out"; then + report fail "$label" "expected 'docker run' under LUCEBOX_NO_EXEC=1; got: $(head -3 <<<"$out")" + return + fi + report ok "$label" +} +test_no_exec_env_forces_run "LUCEBOX_NO_EXEC=1 env override forces docker run" + +test_smoke_routes_to_exec() { + local label="$1" sandbox out + sandbox=$(mktemp -d -t lucebox-route.XXXXXX) + _make_docker_shim "$sandbox" 1 + out=$(_run_wrapper_capture_docker "$sandbox" smoke || true) + rm -rf "$sandbox" + if ! 
grep -q '^DOCKER_INVOKED exec' <<<"$out"; then + report fail "$label" "expected 'docker exec' for smoke when running; got: $(head -3 <<<"$out")" + return + fi + # Confirm the exec'd command tail is `lucebox smoke` — the in-container + # CLI's argv must NOT be polluted with the dispatcher's bookkeeping. + if ! grep -qE 'lucebox smoke' <<<"$out"; then + report fail "$label" "exec'd argv missing 'lucebox smoke' tail" + return + fi + report ok "$label" +} +test_smoke_routes_to_exec "smoke routes to docker exec when container running" + +# ── usage mentions exec-when-running ────────────────────────────────────── +test_usage_mentions_exec_routing() { + local label="$1" out + out=$(NO_COLOR=1 bash "$SCRIPT" --help 2>&1) + if ! grep -qi 'docker exec\|--no-exec' <<<"$out"; then + report fail "$label" "usage doesn't mention the exec routing / --no-exec flag" + return + fi + report ok "$label" +} +test_usage_mentions_exec_routing "usage documents docker exec routing + --no-exec flag" + +echo +if [ "$fail" -eq 0 ]; then + echo "[test_lucebox_sh] $pass passed, 0 failed" + exit 0 +else + echo "[test_lucebox_sh] $pass passed, $fail failed" >&2 + exit 1 +fi diff --git a/uv.lock b/uv.lock index fee8de0d..4c3071f3 100644 --- a/uv.lock +++ b/uv.lock @@ -9,6 +9,8 @@ resolution-markers = [ [manifest] members = [ + "harness", + "lucebox", "lucebox-dflash", "lucebox-hub", "pflash", @@ -314,6 +316,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, ] +[[package]] +name = "harness" +version = "0.1.0" +source = { editable = "harness" } + +[package.optional-dependencies] +dev = [ + { name = "pytest" }, +] + +[package.metadata] +requires-dist = [{ name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }] +provides-extras = ["forge", 
"dev"] + [[package]] name = "hf-xet" version = "1.5.0" @@ -429,6 +445,26 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/59/67/a6739ac96e28b7855808bdb0370e250606104a859750d209e5a0716fe7ab/librt-0.11.0-cp312-cp312-win_arm64.whl", hash = "sha256:2f10cf143e4a9bb0f4f5af568a00df94a2d69ef41c2579584454bb0fe5cc642c", size = 103470, upload-time = "2026-05-10T18:16:10.369Z" }, ] +[[package]] +name = "lucebox" +source = { editable = "lucebox" } +dependencies = [ + { name = "httpx" }, + { name = "huggingface-hub" }, + { name = "rich" }, + { name = "tomli-w" }, + { name = "typer" }, +] + +[package.metadata] +requires-dist = [ + { name = "httpx", specifier = ">=0.27" }, + { name = "huggingface-hub", specifier = ">=0.27" }, + { name = "rich", specifier = ">=13" }, + { name = "tomli-w", specifier = ">=1.0" }, + { name = "typer", specifier = ">=0.12" }, +] + [[package]] name = "lucebox-dflash" version = "0.1.0" @@ -466,6 +502,8 @@ name = "lucebox-hub" version = "0.0.0" source = { virtual = "." 
} dependencies = [ + { name = "harness" }, + { name = "lucebox" }, { name = "lucebox-dflash" }, { name = "pflash" }, ] @@ -482,6 +520,8 @@ megakernel = [ [package.metadata] requires-dist = [ + { name = "harness", editable = "harness" }, + { name = "lucebox", editable = "lucebox" }, { name = "lucebox-dflash", virtual = "server" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.10,<2" }, { name = "pflash", editable = "optimizations/pflash" }, @@ -1124,6 +1164,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" }, ] +[[package]] +name = "tomli-w" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/75/241269d1da26b624c0d5e110e8149093c759b7a286138f4efd61a60e75fe/tomli_w-1.2.0.tar.gz", hash = "sha256:2dd14fac5a47c27be9cd4c976af5a12d87fb1f0b4512f81d69cce3b35ae25021", size = 7184, upload-time = "2025-01-15T12:07:24.262Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/18/c86eb8e0202e32dd3df50d43d7ff9854f8e0603945ff398974c1d91ac1ef/tomli_w-1.2.0-py3-none-any.whl", hash = "sha256:188306098d013b691fcadc011abd66727d3c414c571bb01b1a174ba8c983cf90", size = 6675, upload-time = "2025-01-15T12:07:22.074Z" }, +] + [[package]] name = "torch" version = "2.11.0+cu128" From c46e358cddd2df6d55794aa72bdbe58cb7069f05 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Thu, 11 Jun 2026 10:23:06 -0400 Subject: [PATCH 2/4] fix(harness): drop unused subprocess import in codex client (ruff F401) Co-Authored-By: Claude Opus 4.7 --- harness/src/harness/clients/codex.py | 1 - 1 file changed, 1 deletion(-) diff --git a/harness/src/harness/clients/codex.py b/harness/src/harness/clients/codex.py index 232f29e6..75732887 100644 --- 
a/harness/src/harness/clients/codex.py +++ b/harness/src/harness/clients/codex.py @@ -9,7 +9,6 @@ from __future__ import annotations import os -import subprocess import sys from pathlib import Path From 74d50d0f7528ce59ae72506151e033e1793bfa51 Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Thu, 11 Jun 2026 11:20:47 -0400 Subject: [PATCH 3/4] fix(lucebox): host_info write skips missing dir; test expects refuse-to-auto-pick entrypoint.sh:write_host_info() bailed loudly when /opt/lucebox-hub/ did not exist on the host (unit tests, plain docker run without bind mount), because bash refuses the > redirect before the command runs and 2>/dev/null does not suppress the redirect's own error. Guard with an upfront [ -d ] check. test_lucebox_sh.sh:test_entrypoint_multi_target was asserting against the pre-#334 multi-target semantics (auto-pick + warn + exec shim). PR #334 (merged) changed that to refuse-to-auto-pick + exit non-zero. Update the assertion: still drives the auto-detect block (so any DRAFT_FAMILY_GLOB set -u regression trips), but now requires the refuse warn to fire and the shim to NOT exec. Co-Authored-By: Claude Opus 4.7 --- scripts/test_lucebox_sh.sh | 15 +++++++++------ server/scripts/entrypoint.sh | 9 +++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/scripts/test_lucebox_sh.sh b/scripts/test_lucebox_sh.sh index be75c178..e4ca9be4 100755 --- a/scripts/test_lucebox_sh.sh +++ b/scripts/test_lucebox_sh.sh @@ -501,14 +501,17 @@ STUB rc=$(grep -oE 'RC=[0-9]+$' <<<"$out" | tail -1 | sed 's/^RC=//') rc="${rc:-99}" rm -rf "$sandbox" + # The auto-detect block is entered (so any `set -u` regression on + # DRAFT_FAMILY_GLOB will trip) and then the entrypoint refuses to + # auto-pick — the deliberate safety added in PR #334's cubic round. + # We require: no set-u leak, the refuse warn fired, and the shim + # was NOT exec'd (a silent multi-target auto-pick would be the bug). 
if grep -qE 'unbound variable|syntax error' <<<"$out"; then report fail "$label" "leak: $(grep -E 'unbound variable|syntax error' <<<"$out" | head -3)" - elif [ "$rc" != "0" ]; then - report fail "$label" "exit $rc; output: $(head -5 <<<"$out")" - elif ! grep -qF "[shim] dflash_server" <<<"$out"; then - report fail "$label" "shim never executed; output: $(head -10 <<<"$out")" - elif ! grep -qF "Multiple candidate targets" <<<"$out"; then - report fail "$label" "multi-target warn missing — did the auto-detect block fire?" + elif ! grep -qF "Refusing to auto-select" <<<"$out"; then + report fail "$label" "refuse-to-auto-pick warn missing — did the auto-detect block fire? rc=$rc output: $(head -5 <<<"$out")" + elif grep -qF "[shim] dflash_server" <<<"$out"; then + report fail "$label" "shim was exec'd despite multi-target refuse" else report ok "$label" fi diff --git a/server/scripts/entrypoint.sh b/server/scripts/entrypoint.sh index f35e295c..4c454fe2 100755 --- a/server/scripts/entrypoint.sh +++ b/server/scripts/entrypoint.sh @@ -73,6 +73,15 @@ esac # write-failure (read-only FS, etc.) gets a warning and we continue. write_host_info() { local target="/opt/lucebox-hub/HOST_INFO" + # If the target dir doesn't exist (e.g. running the entrypoint outside + # the canonical container layout: unit tests, plain `docker run` without + # a bind mount), don't try to write — bash's own "No such file or + # directory" complaint on the `> "$tmp"` redirect below would leak to + # stderr regardless of `2>/dev/null` (that suppresses the command's + # stderr, not the redirect itself). HOST_INFO is informational. + if [ ! 
-d "$(dirname "$target")" ]; then + return 0 + fi local tmp="${target}.tmp.$$" local collected_at collected_at=$(date -u +%FT%TZ 2>/dev/null || echo "") From 09f26a8d2301ac4e6a1853765f21ce1bf132850f Mon Sep 17 00:00:00 2001 From: Erik LaBianca Date: Fri, 12 Jun 2026 10:34:36 -0400 Subject: [PATCH 4/4] test(lucebox): assert rc != 0 on multi-target refuse-to-auto-pick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the missing rc != 0 assertion to test_entrypoint_multi_target. The previous fix updated the test to look for the "Refusing to auto-select" warn and a non-exec'd shim, but didn't check the exit code. A regression where the entrypoint logged the warn but still exited 0 (silently auto-picking under the covers) would have slipped through. The container MUST fail to start on multi-target ambiguity — that is the whole point of the policy added in #334. Co-Authored-By: Claude Opus 4.7 --- scripts/test_lucebox_sh.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/test_lucebox_sh.sh b/scripts/test_lucebox_sh.sh index e4ca9be4..3bea83db 100755 --- a/scripts/test_lucebox_sh.sh +++ b/scripts/test_lucebox_sh.sh @@ -504,12 +504,16 @@ STUB # The auto-detect block is entered (so any `set -u` regression on # DRAFT_FAMILY_GLOB will trip) and then the entrypoint refuses to # auto-pick — the deliberate safety added in PR #334's cubic round. - # We require: no set-u leak, the refuse warn fired, and the shim - # was NOT exec'd (a silent multi-target auto-pick would be the bug). + # We require: no set-u leak, the refuse warn fired, a non-zero exit + # (so a future regression that logs the warning but still returns 0 + # cannot slip past — the container MUST fail to start, not silently + # auto-pick a stale GGUF), and the shim was NOT exec'd. if grep -qE 'unbound variable|syntax error' <<<"$out"; then report fail "$label" "leak: $(grep -E 'unbound variable|syntax error' <<<"$out" | head -3)" elif ! 
grep -qF "Refusing to auto-select" <<<"$out"; then report fail "$label" "refuse-to-auto-pick warn missing — did the auto-detect block fire? rc=$rc output: $(head -5 <<<"$out")" + elif [ "$rc" = "0" ]; then + report fail "$label" "refuse warn fired but rc=0 — entrypoint must exit non-zero on multi-target refuse" elif grep -qF "[shim] dflash_server" <<<"$out"; then report fail "$label" "shim was exec'd despite multi-target refuse" else