Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 67 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,21 +1,81 @@
# MAESTRO — Multi-Agent Evaluation for Structured Relational Output
# Dockerfile for cross-platform reproducibility.
#
# The image carries both halves of the pipeline:
# * Python 3.11 — runs the experiment (models, strategies, scoring, DB).
# * mermaid-cli (mmdc) + Chromium — backs the structural-validity metric
# (analysis/metrics.py shells out to `mmdc` to compute parses_valid; without
# it that metric is recorded as NULL for every run).

# Base image: slim Python 3.11. The apt-get step further down relies on this
# being an apt-based distribution.
# NOTE(review): the tag is not pinned to a distro release or digest, so the
# resolved base (and the apt package versions on top of it) can drift between
# builds — consider pinning if bit-for-bit reproducibility matters.
FROM python:3.11-slim

# All later relative paths (COPY targets, puppeteer.json) resolve under /app.
WORKDIR /app

# System packages (kept alphabetical so additions diff cleanly):
#   * chromium          — mmdc renders via Puppeteer, which needs a browser
#   * fonts-* / lib*    — shared libraries Chromium needs to start headless
#   * git               — environment.py records the commit hash per run
#   * nodejs / npm      — runtime for mermaid-cli
# The apt lists are removed in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        chromium \
        fonts-liberation \
        git \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libcups2 \
        libdbus-1-3 \
        libdrm2 \
        libgbm1 \
        libgtk-3-0 \
        libnspr4 \
        libnss3 \
        libxcomposite1 \
        libxdamage1 \
        libxfixes3 \
        libxkbcommon0 \
        libxrandr2 \
        nodejs \
        npm \
    && rm -rf /var/lib/apt/lists/*

# Image digest for run provenance. An image cannot know its own digest at
# build time, so it is passed in (e.g. the CI-resolved digest or the git SHA)
# and baked as an env var; environment.capture_environment() reads it into
# run_environments.docker_image_digest. Unset → recorded as NULL.
#
# NOTE(review): a build arg whose value changes on every build invalidates the
# cache for every instruction that follows it. Declared here, a new digest
# forces the npm install, pip install, and smoke-test layers to rebuild —
# consider moving this ARG/ENV pair to just before CMD.
ARG MAESTRO_IMAGE_DIGEST=
ENV MAESTRO_IMAGE_DIGEST=$MAESTRO_IMAGE_DIGEST

# mermaid-cli, pinned for reproducibility. Puppeteer must drive the system
# Chromium (installed via apt above) rather than downloading its own copy:
#   * PUPPETEER_SKIP_DOWNLOAD          — the variable current Puppeteer
#                                        releases read to skip the download.
#   * PUPPETEER_SKIP_CHROMIUM_DOWNLOAD — legacy name, kept so older Puppeteer
#                                        releases skip the download as well.
#   * PUPPETEER_EXECUTABLE_PATH        — points Puppeteer at the apt-installed
#                                        browser binary.
# Chromium also refuses to run as root without --no-sandbox, which is the norm
# in CI/Docker; that flag is supplied through the Puppeteer config file.
ENV PUPPETEER_SKIP_DOWNLOAD=true \
    PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
    PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium
# Clean the npm cache in the same layer so it does not bloat the image.
RUN npm install -g @mermaid-js/mermaid-cli@11.4.2 \
    && npm cache clean --force
Comment thread
coderabbitai[bot] marked this conversation as resolved.

# Puppeteer launch config consumed by mmdc through its `-p` option:
#   --no-sandbox             Chromium will not start as root without it.
#   --disable-gpu            part of the headless launch flags baked in here.
#   --disable-dev-shm-usage  avoids crashes from the small /dev/shm Docker
#                            allocates by default.
# mmdc only honours these via a config file, so metrics.py reads the path from
# MERMAID_PUPPETEER_CONFIG and forwards it as `-p`. The JSON is passed as a
# printf argument (not as the format string) so it is written out verbatim.
RUN printf '%s' '{"args":["--no-sandbox","--disable-gpu","--disable-dev-shm-usage"]}' > /app/puppeteer.json
ENV MERMAID_PUPPETEER_CONFIG=/app/puppeteer.json
Comment thread
coderabbitai[bot] marked this conversation as resolved.

# Python project: package metadata and sources, then an editable install.
# NOTE(review): both COPY steps come before `pip install`, so any change under
# src/ invalidates the install layer — dependencies are NOT cached across
# source-only changes as currently ordered. Getting that caching would require
# installing the dependencies in a step that precedes `COPY src/`.
COPY pyproject.toml README.md ./
COPY src/ ./src/

# Editable install of the project and its dependencies; --no-cache-dir keeps
# pip's download cache out of the image layer.
RUN pip install --no-cache-dir -e .

# Build-time smoke test: render a two-node flowchart with mmdc and abort the
# build if that fails, so a broken Chromium/Puppeteer setup surfaces here and
# not 80% into a real run. The diagram goes through a temp file (not
# /dev/stdin) so the check exercises the browser rather than the input path,
# and -p names the launch config explicitly instead of relying on env
# discovery. `set -e` stops at the first failing command.
RUN set -e; \
    printf 'flowchart LR\n a["A"] --> b["B"]\n' > /tmp/smoke.mmd; \
    mmdc -p /app/puppeteer.json -i /tmp/smoke.mmd -o /tmp/smoke.png -e png; \
    rm -f /tmp/smoke.mmd /tmp/smoke.png

# Default: import the package and print its version, which doubles as a quick
# check that the install is importable. Override with the experiment runner,
# e.g.
#   docker compose run --rm maestro python -m maestro.run --tier 1
# Exec (JSON-array) form: python runs directly, with no shell wrapper.
CMD ["python", "-c", "import maestro; print(f'MAESTRO v{maestro.__version__}')"]
44 changes: 39 additions & 5 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,44 @@
services:
  # One image, two uses:
  #   * `docker compose up`
  #       serves the Streamlit dashboard (the default command below). The
  #       experiment DB is mounted read-write so a future in-app run can
  #       write to it.
  #   * `docker compose run --rm maestro python -m maestro.run --tier 1`
  #       runs an experiment on the SAME image; the command given on the CLI
  #       overrides the dashboard default.
  # The DB lives on the host at ./out so external tools (Jupyter, sqlite, BI)
  # can read it directly.
  maestro:
    build:
      context: .
      args:
        # Optional run-provenance stamp baked into the image, surfaced in
        # run_environments.docker_image_digest. Pass at build time, e.g.
        #   MAESTRO_IMAGE_DIGEST=$(git rev-parse HEAD) docker compose build
        # Unset → recorded as NULL.
        MAESTRO_IMAGE_DIGEST: ${MAESTRO_IMAGE_DIGEST:-}
    container_name: maestro
    env_file:
      - .env
    environment:
      # SQLite DB in the mounted output dir so results survive teardown and
      # are reachable from the host. Both the runner
      # (experiment_config.DB_PATH) and the dashboard (viz/settings.py) read
      # this same variable.
      MAESTRO_DB_PATH: /app/out/maestro.db
    ports:
      - "8501:8501"
    volumes:
      # Scoped mounts only — never the whole repo (`.:/app`) — so the image's
      # installed package, /app/puppeteer.json, and the globally installed
      # mmdc are not shadowed by host files.
      - ./data:/app/data:ro   # benchmark inputs + ground truth (read-only)
      - ./out:/app/out        # experiment DB, persisted + host-accessible
    # Default command = the dashboard, so `docker compose up` just serves it.
    # --server.address=0.0.0.0 lets the host browser reach the container;
    # headless disables Streamlit's browser-open and first-run prompt.
    command:
      - streamlit
      - run
      - src/maestro/viz/app.py
      - --server.address=0.0.0.0
      - --server.port=8501
      - --server.headless=true
Empty file added out/.gitkeep
Empty file.
22 changes: 21 additions & 1 deletion src/maestro/analysis/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
4. Error taxonomy counts
"""

import os
import re
import shutil
import subprocess
Expand Down Expand Up @@ -43,6 +44,16 @@ def check_mermaid_valid(diagram_code: str) -> tuple[bool | None, str | None]:
if mmdc is None:
return (None, "mmdc not found — validation skipped")

# mmdc renders via Puppeteer/Chromium. In a container running as root,
# Chromium refuses to start without --no-sandbox, which mmdc only picks up
# from a config file passed with -p (it does NOT read a PUPPETEER_* env).
# MERMAID_PUPPETEER_CONFIG points at that file when set (see the Docker
# image); locally it is unset and mmdc uses its working default.
puppeteer_args: list[str] = []
puppeteer_config = os.environ.get("MERMAID_PUPPETEER_CONFIG")
if puppeteer_config and Path(puppeteer_config).is_file():
puppeteer_args = ["-p", puppeteer_config]

# NamedTemporaryFile with delete=True cleans up after the context exits.
# The file is created so mmdc has somewhere to write; we never read it.
# NOTE: Windows-incompatible — Windows holds the named-temp-file open
Expand All @@ -52,7 +63,16 @@ def check_mermaid_valid(diagram_code: str) -> tuple[bool | None, str | None]:
try:
with tempfile.NamedTemporaryFile(suffix=".png", delete=True) as out:
result = subprocess.run(
[mmdc, "-i", "/dev/stdin", "-o", out.name, "-e", "png"],
[
mmdc,
*puppeteer_args,
"-i",
"/dev/stdin",
"-o",
out.name,
"-e",
"png",
],
input=diagram_code,
capture_output=True,
text=True,
Expand Down
8 changes: 6 additions & 2 deletions src/maestro/experiment_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
To enable a strategy: add to STRATEGIES (once implemented)
"""

import os
from pathlib import Path

from maestro.schemas import InputFile, ModelPricing, Strategy, Tier
Expand Down Expand Up @@ -379,6 +380,9 @@
# Number of repeated runs per (input, strategy, model) cell
DEFAULT_REPEATS = 5

# Location of the SQLite database.
#   * Default: maestro.db in the project root (three levels above this file).
#   * Override: a non-empty MAESTRO_DB_PATH environment variable. This lets a
#     containerized run write the DB to a mounted host volume (the project
#     root holds the installed package and is not bind mounted) without
#     touching code.
PROJECT_ROOT = Path(__file__).resolve().parent.parent.parent
DB_PATH = PROJECT_ROOT / "maestro.db"
if os.environ.get("MAESTRO_DB_PATH"):
    # An empty value is treated the same as unset and keeps the default.
    DB_PATH = Path(os.environ["MAESTRO_DB_PATH"])
22 changes: 21 additions & 1 deletion src/maestro/viz/mermaid_render.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from __future__ import annotations

import os
import shutil
import subprocess
import tempfile
Expand All @@ -42,13 +43,32 @@ def render_mermaid_svg(diagram_code: str, *, timeout: int = 15) -> str | None:
if mmdc is None or not diagram_code or not diagram_code.strip():
return None

# Forward a Puppeteer launch config when one is configured (mirrors
# metrics.check_mermaid_valid). In a container running as root, Chromium
# needs --no-sandbox, which mmdc only honours from a -p config file, not an
# env var. Locally MERMAID_PUPPETEER_CONFIG is unset and mmdc uses its
# default; without this, in-container renders would silently return None.
puppeteer_args: list[str] = []
puppeteer_config = os.environ.get("MERMAID_PUPPETEER_CONFIG")
if puppeteer_config and Path(puppeteer_config).is_file():
puppeteer_args = ["-p", puppeteer_config]

try:
with tempfile.TemporaryDirectory() as tmp:
in_path = Path(tmp) / "in.mmd"
out_path = Path(tmp) / "out.svg"
in_path.write_text(diagram_code, encoding="utf-8")
result = subprocess.run(
[mmdc, "-i", str(in_path), "-o", str(out_path), "-e", "svg"],
[
mmdc,
*puppeteer_args,
"-i",
str(in_path),
"-o",
str(out_path),
"-e",
"svg",
],
capture_output=True,
text=True,
timeout=timeout,
Expand Down