diff --git a/Dockerfile b/Dockerfile index e7fa576..98d1e62 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,21 +1,81 @@ -# MAESTRO - Multi-Agent Evaluation for Structured Relational Output -# Dockerfile for cross-platform reproducibility +# MAESTRO — Multi-Agent Evaluation for Structured Relational Output +# Dockerfile for cross-platform reproducibility. +# +# The image carries both halves of the pipeline: +# * Python 3.11 — runs the experiment (models, strategies, scoring, DB). +# * mermaid-cli (mmdc) + Chromium — backs the structural-validity metric +# (analysis/metrics.py shells out to `mmdc` to compute parses_valid; without +# it that metric is recorded as NULL for every run). FROM python:3.11-slim WORKDIR /app -# Install system dependencies +# System dependencies: +# * git — environment.py records the commit hash per run +# * nodejs / npm — runtime for mermaid-cli +# * chromium — mmdc renders via Puppeteer, which needs a browser +# * the lib*/fonts* — shared libraries Chromium needs to start headless RUN apt-get update && apt-get install -y --no-install-recommends \ git \ + nodejs \ + npm \ + chromium \ + fonts-liberation \ + libasound2 \ + libatk-bridge2.0-0 \ + libatk1.0-0 \ + libcups2 \ + libdbus-1-3 \ + libdrm2 \ + libgbm1 \ + libgtk-3-0 \ + libnspr4 \ + libnss3 \ + libxcomposite1 \ + libxdamage1 \ + libxfixes3 \ + libxkbcommon0 \ + libxrandr2 \ && rm -rf /var/lib/apt/lists/* -# Copy project files +# Image digest for run provenance. An image cannot know its own digest at +# build time, so it is passed in (e.g. the CI-resolved digest or the git SHA) +# and baked as an env var; environment.capture_environment() reads it into +# run_environments.docker_image_digest. Unset → recorded as NULL. +ARG MAESTRO_IMAGE_DIGEST= +ENV MAESTRO_IMAGE_DIGEST=$MAESTRO_IMAGE_DIGEST + +# mermaid-cli, pinned for reproducibility. 
Puppeteer must use the system +# Chromium (installed above) rather than downloading its own — and Chromium +# refuses to run as root without --no-sandbox, which is the norm in CI/Docker. +ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ + PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium +RUN npm install -g @mermaid-js/mermaid-cli@11.4.2 + +# A Puppeteer launch config so mmdc starts Chromium headless without a sandbox. +# Chromium refuses to run as root without --no-sandbox; --disable-dev-shm-usage +# avoids crashes from the small /dev/shm Docker allocates by default. mmdc only +# honours these via a config file passed with `-p`, so metrics.py and +# viz/mermaid_render.py read this path from MERMAID_PUPPETEER_CONFIG and pass `-p`. +RUN printf '{"args":["--no-sandbox","--disable-gpu","--disable-dev-shm-usage"]}' \ + > /app/puppeteer.json +ENV MERMAID_PUPPETEER_CONFIG=/app/puppeteer.json + +# Python project. src/ is copied before the editable install, so any source +# change re-runs pip install; this layer does not cache across source edits. COPY pyproject.toml README.md ./ COPY src/ ./src/ - -# Install Python dependencies RUN pip install --no-cache-dir -e . -# Default command +# Sanity: fail the build if mmdc can't actually render, so a broken +# Chromium/Puppeteer setup is caught here, not 80% into a real run. Uses a temp +# file (not /dev/stdin) so this checks the browser, not the input path; -p makes +# the launch config explicit rather than relying on env discovery. +RUN printf 'flowchart LR\n a["A"] --> b["B"]\n' > /tmp/smoke.mmd \ + && mmdc -p /app/puppeteer.json -i /tmp/smoke.mmd -o /tmp/smoke.png -e png \ + && rm -f /tmp/smoke.mmd /tmp/smoke.png + +# Default: print the version. Override with the experiment runner, e.g. 
+# docker compose run --rm maestro python -m maestro.run --tier 1 CMD ["python", "-c", "import maestro; print(f'MAESTRO v{maestro.__version__}')"] diff --git a/docker-compose.yml b/docker-compose.yml index 36a2015..4be1d6d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,10 +1,44 @@ services: + # One image, two uses: + # * `docker compose up` → serves the Streamlit dashboard (default), + # opening the experiment DB read-write so a + # future in-app run can write to it. + # * `docker compose run --rm maestro python -m maestro.run --tier 1` + # → runs an experiment on the SAME image; the + # command overrides the dashboard default. + # The DB lives on the host at ./out so external tools (Jupyter, sqlite, BI) + # can read it directly. maestro: - build: . + build: + context: . + args: + # Optional run-provenance stamp baked into the image, surfaced in + # run_environments.docker_image_digest. Pass at build time, e.g. + # MAESTRO_IMAGE_DIGEST=$(git rev-parse HEAD) docker compose build + # Unset → recorded as NULL. + MAESTRO_IMAGE_DIGEST: ${MAESTRO_IMAGE_DIGEST:-} container_name: maestro - volumes: - - .:/app env_file: - .env - # Override default command for interactive use - # command: python -m maestro + environment: + # SQLite DB in the mounted output dir so results survive teardown and are + # reachable from the host. Both the runner (experiment_config.DB_PATH) and + # the dashboard (viz/settings.py) read this same var. + MAESTRO_DB_PATH: /app/out/maestro.db + ports: + - "8501:8501" + volumes: + # Scoped mounts (not the whole repo) so the image's installed package, + # /app/puppeteer.json, and the globally-installed mmdc are not shadowed. + - ./data:/app/data:ro # benchmark inputs + ground truth (read-only) + - ./out:/app/out # experiment DB, persisted + host-accessible + # Default command = the dashboard, so `docker compose up` just serves it. 
+ # --server.address=0.0.0.0 lets the host browser reach the container; + # headless disables Streamlit's browser-open and first-run prompt. + command: + - streamlit + - run + - src/maestro/viz/app.py + - --server.address=0.0.0.0 + - --server.port=8501 + - --server.headless=true diff --git a/out/.gitkeep b/out/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/src/maestro/analysis/metrics.py b/src/maestro/analysis/metrics.py index 18a5e00..3569163 100644 --- a/src/maestro/analysis/metrics.py +++ b/src/maestro/analysis/metrics.py @@ -8,6 +8,7 @@ 4. Error taxonomy counts """ +import os import re import shutil import subprocess @@ -43,6 +44,16 @@ def check_mermaid_valid(diagram_code: str) -> tuple[bool | None, str | None]: if mmdc is None: return (None, "mmdc not found — validation skipped") + # mmdc renders via Puppeteer/Chromium. In a container running as root, + # Chromium refuses to start without --no-sandbox, which mmdc only picks up + # from a config file passed with -p (launch flags are NOT read from a PUPPETEER_* env). + # MERMAID_PUPPETEER_CONFIG points at that file when set (see the Docker + # image); locally it is unset and mmdc uses its working default. + puppeteer_args: list[str] = [] + puppeteer_config = os.environ.get("MERMAID_PUPPETEER_CONFIG") + if puppeteer_config and Path(puppeteer_config).is_file(): + puppeteer_args = ["-p", puppeteer_config] + # NamedTemporaryFile with delete=True cleans up after the context exits. # The file is created so mmdc has somewhere to write; we never read it. 
# NOTE: Windows-incompatible — Windows holds the named-temp-file open @@ -52,7 +63,16 @@ def check_mermaid_valid(diagram_code: str) -> tuple[bool | None, str | None]: try: with tempfile.NamedTemporaryFile(suffix=".png", delete=True) as out: result = subprocess.run( - [mmdc, "-i", "/dev/stdin", "-o", out.name, "-e", "png"], + [ + mmdc, + *puppeteer_args, + "-i", + "/dev/stdin", + "-o", + out.name, + "-e", + "png", + ], input=diagram_code, capture_output=True, text=True, diff --git a/src/maestro/experiment_config.py b/src/maestro/experiment_config.py index 9880132..3f9fd2a 100644 --- a/src/maestro/experiment_config.py +++ b/src/maestro/experiment_config.py @@ -8,6 +8,7 @@ To enable a strategy: add to STRATEGIES (once implemented) """ +import os from pathlib import Path from maestro.schemas import InputFile, ModelPricing, Strategy, Tier @@ -379,6 +380,9 @@ # Number of repeated runs per (input, strategy, model) cell DEFAULT_REPEATS = 5 -# SQLite database path (project root) +# SQLite database path. Defaults to maestro.db in the project root; override +# with MAESTRO_DB_PATH so a containerized run can write the DB to a mounted +# host volume (the project root holds the installed package and is not bind +# mounted) without touching code. 
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent -DB_PATH = PROJECT_ROOT / "maestro.db" +DB_PATH = Path(os.environ.get("MAESTRO_DB_PATH") or PROJECT_ROOT / "maestro.db") diff --git a/src/maestro/viz/mermaid_render.py b/src/maestro/viz/mermaid_render.py index 1bae95d..6880beb 100644 --- a/src/maestro/viz/mermaid_render.py +++ b/src/maestro/viz/mermaid_render.py @@ -16,6 +16,7 @@ from __future__ import annotations +import os import shutil import subprocess import tempfile @@ -42,13 +43,32 @@ def render_mermaid_svg(diagram_code: str, *, timeout: int = 15) -> str | None: if mmdc is None or not diagram_code or not diagram_code.strip(): return None + # Forward a Puppeteer launch config when one is configured (mirrors + # metrics.check_mermaid_valid). In a container running as root, Chromium + # needs --no-sandbox, which mmdc only honours from a -p config file, not an + # env var. Locally MERMAID_PUPPETEER_CONFIG is unset and mmdc uses its + # default; without this, in-container renders would silently return None. + puppeteer_args: list[str] = [] + puppeteer_config = os.environ.get("MERMAID_PUPPETEER_CONFIG") + if puppeteer_config and Path(puppeteer_config).is_file(): + puppeteer_args = ["-p", puppeteer_config] + try: with tempfile.TemporaryDirectory() as tmp: in_path = Path(tmp) / "in.mmd" out_path = Path(tmp) / "out.svg" in_path.write_text(diagram_code, encoding="utf-8") result = subprocess.run( - [mmdc, "-i", str(in_path), "-o", str(out_path), "-e", "svg"], + [ + mmdc, + *puppeteer_args, + "-i", + str(in_path), + "-o", + str(out_path), + "-e", + "svg", + ], capture_output=True, text=True, timeout=timeout,