diff --git a/src/maestro/viz/mermaid_render.py b/src/maestro/viz/mermaid_render.py
new file mode 100644
index 0000000..1bae95d
--- /dev/null
+++ b/src/maestro/viz/mermaid_render.py
@@ -0,0 +1,62 @@
+"""
+MAESTRO viz — render Mermaid source to SVG via the mmdc CLI.
+
+The dashboard renders diagrams with the *same* engine the metric pipeline
+uses to compute ``parses_valid`` (mmdc — see analysis/metrics.py), so the
+picture a viewer sees is consistent with the validity the data records. This
+matters for a thesis artifact: a second, in-browser renderer could disagree
+with mmdc and produce a diagram that looks fine but was scored invalid (or
+vice versa). mmdc is also deterministic and version-pinnable, so a rendered
+figure is reproducible.
+
+When mmdc is not installed the renderer returns ``None`` and the caller falls
+back to showing the source — the honest behavior is to not fabricate a render
+we cannot produce. Requires: ``npm install -g @mermaid-js/mermaid-cli``.
+"""
+
+from __future__ import annotations
+
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+
+
+def mmdc_available() -> bool:
+    """Whether the mmdc CLI is on PATH."""
+    return shutil.which("mmdc") is not None
+
+
+def render_mermaid_svg(diagram_code: str, *, timeout: int = 15) -> str | None:
+    """
+    Render ``diagram_code`` to an SVG string via mmdc, or ``None`` on any
+    failure (mmdc missing, invalid source, timeout, undecodable output).
+    Callers treat ``None`` as "show the source instead".
+
+    Unlike the validate-only path in metrics.py, this reads the produced SVG
+    back. Input is written to a temp ``.mmd`` file (not ``/dev/stdin``) so the
+    input side is not Unix-only; the output temp file is created in a temp dir
+    and read after mmdc exits. Empty/blank source short-circuits to ``None``.
+    """
+    mmdc = shutil.which("mmdc")
+    if mmdc is None or not diagram_code or not diagram_code.strip():
+        return None
+
+    try:
+        with tempfile.TemporaryDirectory() as tmp:
+            in_path = Path(tmp) / "in.mmd"
+            out_path = Path(tmp) / "out.svg"
+            in_path.write_text(diagram_code, encoding="utf-8")
+            result = subprocess.run(
+                [mmdc, "-i", str(in_path), "-o", str(out_path), "-e", "svg"],
+                capture_output=True,
+                text=True,
+                timeout=timeout,
+            )
+            if result.returncode != 0 or not out_path.exists():
+                return None
+            return out_path.read_text(encoding="utf-8")
+    except (subprocess.TimeoutExpired, OSError, UnicodeError):
+        # UnicodeError: text=True decoding of mmdc's output, or the UTF-8
+        # read/write of the temp files, can fail without raising OSError.
+        return None
diff --git a/src/maestro/viz/queries.py b/src/maestro/viz/queries.py
index df21507..7cb9859 100644
--- a/src/maestro/viz/queries.py
+++ b/src/maestro/viz/queries.py
@@ -16,9 +16,18 @@
 
 import re
 import sqlite3
+from typing import Any
 
 from maestro.experiment_config import CONTROL_STRATEGIES
 
+# SQL fragment matching a "successful" run, mirroring RunResult.success
+# (schemas.py): no error and a non-empty output diagram. Used wherever a
+# success/failure split is needed.
+_SUCCESS_SQL = (
+    "(r.error IS NULL AND r.output_diagram_code IS NOT NULL "
+    "AND TRIM(r.output_diagram_code) != '')"
+)
+
 # A valid SQLite identifier for our purposes: a bare table name. Used to
 # reject anything that isn't a plain identifier before it reaches an
 # interpolated query (table names can't be bound as parameters).
@@ -100,3 +109,394 @@ def mean_entity_id_f1_by_strategy(
         _CONTROL_VALUES,
     ).fetchall()
     return [(r["strategy"], float(r["mean_f1"])) for r in rows]
+
+
+# ---------------------------------------------------------------------------
+# Overview view
+# ---------------------------------------------------------------------------
+
+
+def overview_summary(conn: sqlite3.Connection) -> dict[str, Any]:
+    """
+    Headline operational counts for the Overview cards: total runs, successful
+    runs, success rate, total cost (USD), and distinct environments. Returns
+    zeros on an empty database.
+    """
+    if not (table_exists(conn, "run_configs") and table_exists(conn, "run_results")):
+        return {
+            "total_runs": 0,
+            "successful_runs": 0,
+            "success_rate": 0.0,
+            "total_cost_usd": 0.0,
+            "distinct_environments": 0,
+        }
+    row = conn.execute(
+        f"""
+        SELECT
+            COUNT(*) AS total_runs,
+            SUM(CASE WHEN {_SUCCESS_SQL} THEN 1 ELSE 0 END) AS successful_runs,
+            COALESCE(SUM(r.cost_usd), 0.0) AS total_cost_usd,
+            COUNT(DISTINCT c.environment_id) AS distinct_environments
+        FROM run_configs c
+        JOIN run_results r ON c.run_id = r.run_id
+        """
+    ).fetchone()
+    total = row["total_runs"] or 0
+    successful = row["successful_runs"] or 0
+    return {
+        "total_runs": total,
+        "successful_runs": successful,
+        "success_rate": (successful / total) if total else 0.0,
+        "total_cost_usd": float(row["total_cost_usd"] or 0.0),
+        "distinct_environments": row["distinct_environments"] or 0,
+    }
+
+
+def runs_by_strategy_success(
+    conn: sqlite3.Connection,
+) -> list[tuple[str, int, int]]:
+    """
+    Per strategy: ``(strategy, n_success, n_failure)`` across all runs.
+    Includes every strategy present (controls included — this is an
+    operational summary, not a strategy comparison). Empty list if no runs.
+    """
+    if not (table_exists(conn, "run_configs") and table_exists(conn, "run_results")):
+        return []
+    rows = conn.execute(
+        f"""
+        SELECT
+            c.strategy AS strategy,
+            SUM(CASE WHEN {_SUCCESS_SQL} THEN 1 ELSE 0 END) AS n_success,
+            SUM(CASE WHEN {_SUCCESS_SQL} THEN 0 ELSE 1 END) AS n_failure
+        FROM run_configs c
+        JOIN run_results r ON c.run_id = r.run_id
+        GROUP BY c.strategy
+        ORDER BY c.strategy
+        """
+    ).fetchall()
+    return [(r["strategy"], r["n_success"], r["n_failure"]) for r in rows]
+
+
+def total_cost_by_strategy(conn: sqlite3.Connection) -> list[tuple[str, float]]:
+    """Per strategy total cost (USD), all strategies. Empty list if no runs."""
+    if not (table_exists(conn, "run_configs") and table_exists(conn, "run_results")):
+        return []
+    rows = conn.execute(
+        """
+        SELECT c.strategy AS strategy, COALESCE(SUM(r.cost_usd), 0.0) AS cost
+        FROM run_configs c
+        JOIN run_results r ON c.run_id = r.run_id
+        GROUP BY c.strategy
+        ORDER BY c.strategy
+        """
+    ).fetchall()
+    return [(r["strategy"], float(r["cost"])) for r in rows]
+
+
+# ---------------------------------------------------------------------------
+# Strategy Comparison view
+# ---------------------------------------------------------------------------
+
+# Metric families the Strategy Comparison view can show, mapped to their
+# column prefixes. The view picks precision / recall / f1 as the suffix.
+ENTITY_METRICS = ("entity_id", "entity_name", "entity_lemma")
+RELATIONSHIP_METRICS = ("relationship_relaxed", "relationship_strict")
+
+
+def distinct_tiers(conn: sqlite3.Connection) -> list[int]:
+    """Distinct tier values present in run_configs, ascending."""
+    if not table_exists(conn, "run_configs"):
+        return []
+    rows = conn.execute(
+        "SELECT DISTINCT tier FROM run_configs ORDER BY tier"
+    ).fetchall()
+    return [int(r["tier"]) for r in rows]
+
+
+def distinct_models(conn: sqlite3.Connection) -> list[str]:
+    """Distinct model ids present in run_configs (controls' 'control' excluded)."""
+    if not table_exists(conn, "run_configs"):
+        return []
+    rows = conn.execute(
+        "SELECT DISTINCT model FROM run_configs WHERE model != 'control' ORDER BY model"
+    ).fetchall()
+    return [r["model"] for r in rows]
+
+
+def metric_means_by_strategy(
+    conn: sqlite3.Connection,
+    metric_columns: list[str],
+    *,
+    tier: int | None = None,
+    models: list[str] | None = None,
+) -> dict[str, dict[str, float]]:
+    """
+    Mean of each column in ``metric_columns`` per strategy, optionally filtered
+    to a tier and/or a set of models. Controls excluded.
+
+    Returns ``{strategy: {metric_column: mean}}``. ``metric_columns`` are
+    validated as identifiers (they are interpolated, not bindable). Empty dict
+    if no rows match.
+    """
+    for col in metric_columns:
+        if not _IDENTIFIER_RE.match(col):
+            raise ValueError(f"invalid metric column: {col!r}")
+    if not (table_exists(conn, "metric_results") and table_exists(conn, "run_configs")):
+        return {}
+
+    where = [f"c.strategy NOT IN ({','.join('?' * len(_CONTROL_VALUES))})"]
+    params: list[Any] = list(_CONTROL_VALUES)
+    if tier is not None:
+        where.append("c.tier = ?")
+        params.append(tier)
+    if models:
+        where.append(f"c.model IN ({','.join('?' * len(models))})")
+        params.extend(models)
+
+    avg_cols = ", ".join(f'AVG(m."{c}") AS "{c}"' for c in metric_columns)
+    rows = conn.execute(
+        f"""
+        SELECT c.strategy AS strategy, {avg_cols}
+        FROM run_configs c
+        JOIN metric_results m ON c.run_id = m.run_id
+        WHERE {" AND ".join(where)}
+        GROUP BY c.strategy
+        ORDER BY c.strategy
+        """,
+        params,
+    ).fetchall()
+    return {
+        r["strategy"]: {c: float(r[c]) for c in metric_columns if r[c] is not None}
+        for r in rows
+    }
+
+
+# ---------------------------------------------------------------------------
+# Pareto view
+# ---------------------------------------------------------------------------
+
+
+def pareto_points(
+    conn: sqlite3.Connection,
+    *,
+    strategies: list[str] | None = None,
+    tiers: list[int] | None = None,
+) -> list[dict[str, Any]]:
+    """
+    One row per scored run with the fields the Pareto view plots and tabulates:
+    run_id, strategy, model, tier, cost_usd, duration_ms, entity_id_f1.
+    Controls excluded. Optional strategy/tier filters. Empty list if none.
+    """
+    needed = ("run_configs", "run_results", "metric_results")
+    if not all(table_exists(conn, t) for t in needed):
+        return []
+
+    where = [f"c.strategy NOT IN ({','.join('?' * len(_CONTROL_VALUES))})"]
+    params: list[Any] = list(_CONTROL_VALUES)
+    if strategies:
+        where.append(f"c.strategy IN ({','.join('?' * len(strategies))})")
+        params.extend(strategies)
+    if tiers:
+        where.append(f"c.tier IN ({','.join('?' * len(tiers))})")
+        params.extend(tiers)
+
+    rows = conn.execute(
+        f"""
+        SELECT
+            c.run_id AS run_id, c.strategy AS strategy, c.model AS model,
+            c.tier AS tier, r.cost_usd AS cost_usd, r.duration_ms AS duration_ms,
+            m.entity_id_f1 AS entity_id_f1
+        FROM run_configs c
+        JOIN run_results r ON c.run_id = r.run_id
+        JOIN metric_results m ON c.run_id = m.run_id
+        WHERE {" AND ".join(where)}
+        ORDER BY c.strategy, c.model
+        """,
+        params,
+    ).fetchall()
+    return [dict(r) for r in rows]
+
+
+def distinct_strategies(
+    conn: sqlite3.Connection, *, exclude_controls: bool = True
+) -> list[str]:
+    """Distinct strategy values present in run_configs, ascending."""
+    if not table_exists(conn, "run_configs"):
+        return []
+    if exclude_controls:
+        rows = conn.execute(
+            f"SELECT DISTINCT strategy FROM run_configs "
+            f"WHERE strategy NOT IN ({','.join('?' * len(_CONTROL_VALUES))}) "
+            f"ORDER BY strategy",
+            _CONTROL_VALUES,
+        ).fetchall()
+    else:
+        rows = conn.execute(
+            "SELECT DISTINCT strategy FROM run_configs ORDER BY strategy"
+        ).fetchall()
+    return [r["strategy"] for r in rows]
+
+
+# ---------------------------------------------------------------------------
+# Run Detail view
+# ---------------------------------------------------------------------------
+
+
+def list_runs(conn: sqlite3.Connection) -> list[dict[str, Any]]:
+    """
+    Every run as a selectable entry: run_id, strategy, model, tier, example_id,
+    timestamp. Most recent first. Empty list if no runs.
+    """
+    if not table_exists(conn, "run_configs"):
+        return []
+    rows = conn.execute(
+        """
+        SELECT run_id, strategy, model, tier, example_id, timestamp
+        FROM run_configs
+        ORDER BY timestamp DESC
+        """
+    ).fetchall()
+    return [dict(r) for r in rows]
+
+
+def run_detail(conn: sqlite3.Connection, run_id: str) -> dict[str, Any] | None:
+    """
+    Full detail for one run: the config, result (output diagram, tokens, cost,
+    error), and metric scores, as a flat dict. None if the run_id is unknown.
+
+    The metric keys are always present; they are ``None`` when the run has no
+    metric row or the ``metric_results`` table does not exist yet.
+    """
+    if not (table_exists(conn, "run_configs") and table_exists(conn, "run_results")):
+        return None
+    # Columns listed explicitly (not c.*, r.*, m.*): all three tables carry a
+    # run_id, and a star-join would emit it three times, with sqlite3.Row
+    # silently keeping only the last. Naming the consumed columns avoids the
+    # collision and documents exactly what the Run Detail / Diagram Visualizer
+    # views read.
+    metric_cols = (
+        "parses_valid",
+        "entity_id_f1",
+        "entity_name_f1",
+        "entity_lemma_f1",
+        "relationship_relaxed_f1",
+        "relationship_strict_f1",
+    )
+    # metric_results is optional (every other query here guards the tables it
+    # touches): without it, select NULL placeholders instead of joining, so an
+    # unscored database shows the run rather than raising "no such table".
+    if table_exists(conn, "metric_results"):
+        metric_select = ", ".join(f"m.{col} AS {col}" for col in metric_cols)
+        metric_join = "LEFT JOIN metric_results m ON c.run_id = m.run_id"
+    else:
+        metric_select = ", ".join(f"NULL AS {col}" for col in metric_cols)
+        metric_join = ""
+    row = conn.execute(
+        f"""
+        SELECT
+            c.run_id AS run_id, c.strategy AS strategy, c.model AS model,
+            c.tier AS tier, c.example_id AS example_id, c.timestamp AS timestamp,
+            r.output_diagram_code AS output_diagram_code,
+            r.prompt_tokens AS prompt_tokens,
+            r.completion_tokens AS completion_tokens,
+            r.duration_ms AS duration_ms, r.cost_usd AS cost_usd,
+            r.error AS error, r.retry_count AS retry_count,
+            {metric_select}
+        FROM run_configs c
+        JOIN run_results r ON c.run_id = r.run_id
+        {metric_join}
+        WHERE c.run_id = ?
+        """,
+        (run_id,),
+    ).fetchone()
+    return dict(row) if row else None
+
+
+def sub_results_for_run(conn: sqlite3.Connection, run_id: str) -> list[dict[str, Any]]:
+    """Sub-call trace for a run, ordered by step. Empty if none / no table."""
+    if not table_exists(conn, "sub_results"):
+        return []
+    rows = conn.execute(
+        """
+        SELECT step_number, step_name, output_text, prompt_tokens,
+               completion_tokens, duration_ms, cost_usd, error, retry_count
+        FROM sub_results
+        WHERE run_id = ?
+        ORDER BY step_number
+        """,
+        (run_id,),
+    ).fetchall()
+    return [dict(r) for r in rows]
+
+
+# ---------------------------------------------------------------------------
+# Hallucination Taxonomy view
+# ---------------------------------------------------------------------------
+
+ENTITY_TAXONOMY = (
+    "missing_entities",
+    "extra_entities",
+    "false_entities",
+    "duplicate_entities",
+)
+RELATIONSHIP_TAXONOMY = (
+    "missing_relationships",
+    "extra_relationships",
+    "false_relationships",
+    "duplicate_relationships",
+)
+
+
+def has_any_taxonomy_data(conn: sqlite3.Connection) -> bool:
+    """
+    True if any error-taxonomy count is non-zero across all scored runs. Drives
+    the hallucination view's gating empty-state (all-zero ⇒ nothing to show).
+    """
+    if not table_exists(conn, "metric_results"):
+        return False
+    cols = ENTITY_TAXONOMY + RELATIONSHIP_TAXONOMY
+    total = " + ".join(f"COALESCE(SUM({c}), 0)" for c in cols)
+    row = conn.execute(
+        f"SELECT ({total}) AS grand_total FROM metric_results"
+    ).fetchone()
+    return bool(row["grand_total"])
+
+
+def taxonomy_counts_by_strategy(
+    conn: sqlite3.Connection,
+    columns: tuple[str, ...],
+    *,
+    tier: int | None = None,
+) -> dict[str, dict[str, int]]:
+    """
+    Summed taxonomy counts per strategy for the given taxonomy ``columns``
+    (entity or relationship set), optionally filtered to a tier. Controls
+    included — their error profile is itself informative. Returns
+    ``{strategy: {column: total}}``.
+    """
+    for col in columns:
+        if not _IDENTIFIER_RE.match(col):
+            raise ValueError(f"invalid taxonomy column: {col!r}")
+    if not (table_exists(conn, "metric_results") and table_exists(conn, "run_configs")):
+        return {}
+
+    where = []
+    params: list[Any] = []
+    if tier is not None:
+        where.append("c.tier = ?")
+        params.append(tier)
+    where_sql = f"WHERE {' AND '.join(where)}" if where else ""
+
+    sum_cols = ", ".join(f'SUM(m."{c}") AS "{c}"' for c in columns)
+    rows = conn.execute(
+        f"""
+        SELECT c.strategy AS strategy, {sum_cols}
+        FROM run_configs c
+        JOIN metric_results m ON c.run_id = m.run_id
+        {where_sql}
+        GROUP BY c.strategy
+        ORDER BY c.strategy
+        """,
+        params,
+    ).fetchall()
+    return {r["strategy"]: {c: int(r[c] or 0) for c in columns} for r in rows}
diff --git a/src/maestro/viz/views/__init__.py b/src/maestro/viz/views/__init__.py
index 1009727..d05758c 100644
--- a/src/maestro/viz/views/__init__.py
+++ b/src/maestro/viz/views/__init__.py
@@ -6,11 +6,10 @@
 navigation in ``app.py`` is driven by the ``VIEWS`` registry below: each entry
 is a (label, render-callable) pair, rendered when selected.
 
-The registry currently holds a live "Home" view that confirms the
-navigation + settings + empty-state wiring, plus placeholder entries for the
-planned data views. Each planned view becomes a ``views/<name>.py`` module
-exposing ``render()``, appended to ``VIEWS``; until implemented it shows a
-placeholder card.
+The registry holds a "Home" landing view plus the data views (Overview,
+Strategy Comparison, Pareto, Run Detail, Diagram Visualizer, Hallucination
+Taxonomy). Each data view lives in its own ``views/<name>.py`` module exposing
+``render()``.
 """
 
 from __future__ import annotations
@@ -21,28 +20,26 @@
 from maestro.viz import settings as viz_settings
 from maestro.viz.charts_reference import render_reference_chart
 
-
-# Names of the planned data views, shown in the nav as placeholders so the
-# eventual structure is visible before each is implemented.
-_PLANNED_VIEWS: tuple[str, ...] = (
-    "Overview",
-    "Strategy Comparison",
-    "Pareto",
-    "Run Detail",
-    "Hallucination Taxonomy",
+from maestro.viz.views import (
+    diagram_visualizer,
+    hallucination,
+    overview,
+    pareto,
+    run_detail,
+    strategy_comparison,
 )
 
 
-def _render_placeholder() -> None:
+def _render_home() -> None:
     """
-    The Home view: confirms the app is wired up and shows the design-system
+    The Home landing view: orients the user and shows the design-system
     reference chart against the configured database.
     """
     st.title("MAESTRO — Results Dashboard")
     st.write(
-        "Navigation, read-only database access, settings, and empty-state "
-        "handling are in place. Select a planned view from the sidebar to see "
-        "its placeholder."
+        "Read-only dashboard over the experiment database. Use the sidebar to "
+        "open a view; configure the database path and display timezone under "
+        "⚙️ Settings."
     )
     st.divider()
     # Bound the chart to a left-hand portion of the wide page so the figure
@@ -52,22 +49,13 @@ def _render_placeholder() -> None:
         render_reference_chart(viz_settings.current_settings().db_path)
 
 
-def _make_planned_placeholder(name: str) -> Callable[[], None]:
-    """Build a render() that shows a 'not yet implemented' card for ``name``."""
-
-    def _render() -> None:
-        st.title(name)
-        st.info(
-            f"The **{name}** view is not implemented yet.",
-            icon="🚧",
-        )
-
-    return _render
-
-
-# (label, render) pairs in nav order. The live placeholder first, then the
-# planned views as placeholders.
+# (label, render) pairs in sidebar order: Home landing, then the data views.
 VIEWS: list[tuple[str, Callable[[], None]]] = [
-    ("Home", _render_placeholder),
-    *[(name, _make_planned_placeholder(name)) for name in _PLANNED_VIEWS],
+    ("Home", _render_home),
+    ("Overview", overview.render),
+    ("Strategy Comparison", strategy_comparison.render),
+    ("Pareto", pareto.render),
+    ("Run Detail", run_detail.render),
+    ("Diagram Visualizer", diagram_visualizer.render),
+    ("Hallucination Taxonomy", hallucination.render),
 ]
diff --git a/src/maestro/viz/views/diagram_visualizer.py b/src/maestro/viz/views/diagram_visualizer.py
new file mode 100644
index 0000000..62b421a
--- /dev/null
+++ b/src/maestro/viz/views/diagram_visualizer.py
@@ -0,0 +1,152 @@
+"""
+MAESTRO viz — Diagram Visualizer view (diagnostic, no RQ mapping).
+
+Side-by-side comparison of a run's ground-truth diagram and its generated
+diagram. Ground truth is read from the file system via
+experiment_config.INPUTS (the DB stores only example_id); the generated
+diagram comes from run_results.
+
+A Code / Visualization toggle switches both panes together. Visualization
+renders with the mmdc CLI — the same engine the metric pipeline uses for
+``parses_valid`` — so the rendered picture is consistent with the recorded
+validity, deterministic, and reproducible. When mmdc is unavailable (or a
+source fails to render), the pane falls back to showing the source code.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import streamlit as st
+
+from maestro.experiment_config import INPUTS
+from maestro.viz import db as viz_db
+from maestro.viz import queries as viz_queries
+from maestro.viz import settings as viz_settings
+from maestro.viz.components import empty_state
+from maestro.viz.mermaid_render import mmdc_available, render_mermaid_svg
+from maestro.viz.theme import strategy_display_name
+
+# example_id → InputFile, for resolving the ground-truth file path.
+_INPUTS_BY_ID = {inp.example_id: inp for inp in INPUTS}
+
+
+def render() -> None:
+    """Draw the Diagram Visualizer page."""
+    st.title("Diagram Visualizer")
+    st.caption("Ground-truth vs. generated diagram source, side by side.")
+
+    db_path: Path = viz_settings.current_settings().db_path
+    if not viz_db.database_exists(db_path):
+        empty_state(
+            "Database not found.",
+            "Run an experiment first, or update the path in ⚙️ Settings.",
+        )
+        return
+
+    with viz_db.connect(db_path) as conn:
+        runs = viz_queries.list_runs(conn)
+        if not runs:
+            empty_state("No runs available.")
+            return
+
+        labels = {
+            r["run_id"]: (
+                f"{strategy_display_name(r['strategy'])} | {r['model']} | "
+                f"tier {r['tier']} | {r['example_id']}"
+            )
+            for r in runs
+        }
+        run_id = st.selectbox(
+            "Run",
+            options=[r["run_id"] for r in runs],
+            format_func=lambda rid: labels[rid],
+        )
+        detail = viz_queries.run_detail(conn, run_id)
+
+    if detail is None:
+        empty_state("Selected run not found.")
+        return
+
+    # Code / Visualization toggle, applied to both panes together so the
+    # comparison is always like-with-like. Visualization is only offered when
+    # mmdc is installed; otherwise force Code mode with a note.
+    if mmdc_available():
+        mode = st.radio("Display", ["Code", "Visualization"], horizontal=True)
+    else:
+        mode = "Code"
+        st.caption(
+            "Install the Mermaid CLI (`npm install -g @mermaid-js/mermaid-cli`) "
+            "to enable the Visualization mode."
+        )
+
+    _render_side_by_side(detail, render_visual=(mode == "Visualization"))
+
+
+def _render_side_by_side(detail: dict, *, render_visual: bool) -> None:
+    """Ground truth (left) next to generated (right), as code or rendered SVG."""
+    left, right = st.columns(2)
+
+    with left:
+        st.subheader("Ground truth")
+        inp = _INPUTS_BY_ID.get(detail["example_id"])
+        if inp is None:
+            st.info(
+                f"No input registered for example_id "
+                f"`{detail['example_id']}` in experiment_config.INPUTS."
+            )
+        else:
+            _render_diagram(
+                _read_or_note(inp.ground_truth_path), render_visual=render_visual
+            )
+
+    with right:
+        st.subheader("Generated")
+        # SQLite stores booleans as integers, so this normally arrives as 0 / 1
+        # rather than False / True (None = no recorded validity). Test for None
+        # first, then truthiness: an identity check never matches an int.
+        parses = detail.get("parses_valid")
+        if parses is None:
+            st.warning("Parse validity: not checked", icon="❔")
+        elif parses:
+            st.success("Parses: valid", icon="✅")
+        else:
+            st.error("Parses: invalid", icon="❌")
+
+        output = detail.get("output_diagram_code")
+        if output:
+            _render_diagram(output, render_visual=render_visual)
+        else:
+            st.info("No diagram produced for this run.")
+            if detail.get("error"):
+                st.error(f"Run error: {detail['error']}")
+
+
+def _render_diagram(source: str, *, render_visual: bool) -> None:
+    """
+    Show ``source`` as a rendered SVG (Visualization mode) or as code. Falls
+    back to code if rendering fails — e.g. invalid Mermaid the metric pipeline
+    also rejected — so a broken diagram still shows its source for inspection.
+    """
+    if render_visual:
+        svg = render_mermaid_svg(source)
+        if svg is not None:
+            # st.image treats a string as a path, so an SVG string can't go
+            # there. Embed the SVG markup directly; constrain width so it
+            # scales to the column. The SVG comes from our own mmdc render of
+            # data already in the DB — not arbitrary user input.
+            st.markdown(
+                f'<div style="max-width: 100%;">{svg}</div>',
+                unsafe_allow_html=True,
+            )
+            return
+        st.caption("Could not render this source — showing code instead.")
+    st.code(source, language="mermaid")
+
+
+def _read_or_note(path: Path) -> str:
+    """Read a text file, or return a short note if it's missing/unreadable."""
+    try:
+        return Path(path).read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError):
+        return f"(file not readable: {path})"
diff --git a/src/maestro/viz/views/hallucination.py b/src/maestro/viz/views/hallucination.py
new file mode 100644
index 0000000..ea0da4d
--- /dev/null
+++ b/src/maestro/viz/views/hallucination.py
@@ -0,0 +1,162 @@
+"""
+MAESTRO viz — Hallucination Taxonomy view (RQ3).
+
+Exploratory characterization of error types per strategy: stacked bars of the
+four taxonomy categories (missing / extra / false / duplicate) at the entity
+and relationship levels. Controls are included — their error profile is
+itself informative.
+
+The four-category error palette is defined here rather than in the shared
+theme: the design guide's section 1 covers strategy / provider / tier but not
+errors yet. If this palette proves reusable it should graduate into the guide
+and theme.py.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+import streamlit as st
+
+from maestro.viz import db as viz_db
+from maestro.viz import queries as viz_queries
+from maestro.viz import settings as viz_settings
+from maestro.viz.chart import new_figure, render_chart
+from maestro.viz.components import empty_state
+from maestro.viz.theme import strategy_display_name
+
+# Four-category error palette (view-local; candidate for the design guide).
+# Distinct from the strategy/provider/tier palettes since errors co-occur with
+# strategy on the same chart.
+_ERROR_COLORS = {
+    "missing": "#E67E22",  # dropped from truth
+    "extra": "#3498DB",  # invented
+    "false": "#E74C3C",  # present but wrong
+    "duplicate": "#95A5A6",  # repeated
+}
+# Short category label (the taxonomy column suffix) → display label.
+_CATEGORY_LABEL = {
+    "missing": "Missing",
+    "extra": "Extra",
+    "false": "False",
+    "duplicate": "Duplicate",
+}
+
+
+def render() -> None:
+    """Draw the Hallucination Taxonomy page."""
+    st.title("Hallucination Taxonomy")
+
+    db_path: Path = viz_settings.current_settings().db_path
+    if not viz_db.database_exists(db_path):
+        empty_state(
+            "Database not found.",
+            "Run an experiment first, or update the path in ⚙️ Settings.",
+        )
+        return
+
+    with viz_db.connect(db_path) as conn:
+        if not viz_queries.has_any_taxonomy_data(conn):
+            empty_state("No hallucination / error data recorded yet.")
+            return
+        tiers = viz_queries.distinct_tiers(conn)
+        tier = st.selectbox(
+            "Tier",
+            options=[None, *tiers],
+            format_func=lambda t: "All tiers" if t is None else f"Tier {t}",
+        )
+
+        entity_data = viz_queries.taxonomy_counts_by_strategy(
+            conn, viz_queries.ENTITY_TAXONOMY, tier=tier
+        )
+        rel_data = viz_queries.taxonomy_counts_by_strategy(
+            conn, viz_queries.RELATIONSHIP_TAXONOMY, tier=tier
+        )
+
+    _stacked_bar(
+        entity_data,
+        columns=viz_queries.ENTITY_TAXONOMY,
+        title="Entity errors per strategy",
+        filename="entity_errors_by_strategy",
+        key="halluc-entity",
+    )
+    _stacked_bar(
+        rel_data,
+        columns=viz_queries.RELATIONSHIP_TAXONOMY,
+        title="Relationship errors per strategy",
+        filename="relationship_errors_by_strategy",
+        key="halluc-rel",
+    )
+
+
+def _category_of(column: str) -> str:
+    """'missing_entities' / 'extra_relationships' → 'missing' / 'extra'."""
+    return column.split("_", 1)[0]
+
+
+def _label_color(hex_color: str) -> str:
+    """
+    Pick a legible label color for text on ``hex_color`` (relative luminance,
+    per the design guide's cell-annotation rule): white on dark fills,
+    ``#333333`` on light fills.
+    """
+    h = hex_color.lstrip("#")
+    r, g, b = (int(h[i : i + 2], 16) / 255 for i in (0, 2, 4))
+    luminance = 0.2126 * r + 0.7152 * g + 0.0722 * b
+    return "#333333" if luminance > 0.5 else "white"
+
+
+def _stacked_bar(
+    data: dict[str, dict[str, int]],
+    *,
+    columns: tuple[str, ...],
+    title: str,
+    filename: str,
+    key: str,
+) -> None:
+    """Stacked bars: one bar per strategy, one segment per error category."""
+    if not data:
+        empty_state("No data for the current filter.", hint=title)
+        return
+
+    strategies = list(data)
+    names = [strategy_display_name(s) for s in strategies]
+    x = np.arange(len(strategies))
+    bottom = np.zeros(len(strategies))
+
+    fig, ax = new_figure(figsize=(9.0, 4.5))
+    for col in columns:
+        cat = _category_of(col)
+        fill = _ERROR_COLORS[cat]
+        text_color = _label_color(fill)
+        heights = np.array([data[s].get(col, 0) for s in strategies], dtype=float)
+        ax.bar(
+            x,
+            heights,
+            bottom=bottom,
+            color=fill,
+            label=_CATEGORY_LABEL[cat],
+        )
+        # Value labels for non-zero segments (no hover on static figures);
+        # label color picked for contrast against the segment fill.
+        for xi, (h, b) in enumerate(zip(heights, bottom)):
+            if h > 0:
+                ax.text(
+                    xi,
+                    b + h / 2,
+                    str(int(h)),
+                    ha="center",
+                    va="center",
+                    fontsize=8,
+                    color=text_color,
+                )
+        bottom += heights
+
+    ax.set_xticks(x)
+    ax.set_xticklabels(names)
+    ax.set_ylabel("Error count")
+    ax.grid(axis="y")  # vertical bars → horizontal grid only
+    ax.legend(title="Error type")
+    fig.tight_layout()
+    render_chart(fig, filename=filename, key=key, caption=title)
diff --git a/src/maestro/viz/views/overview.py b/src/maestro/viz/views/overview.py
new file mode 100644
index 0000000..4bd6734
--- /dev/null
+++ b/src/maestro/viz/views/overview.py
@@ -0,0 +1,106 @@
+"""
+MAESTRO viz — Overview view.
+
+Operational summary (no RQ mapping): headline metric cards plus per-strategy
+run-count and cost bars. Reads run_configs / run_results only; the environment
+count is the number of distinct ``environment_id`` values in run_configs.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import streamlit as st
+
+from maestro.viz import db as viz_db
+from maestro.viz import queries as viz_queries
+from maestro.viz import settings as viz_settings
+from maestro.viz.chart import new_figure, render_chart
+from maestro.viz.components import empty_state
+from maestro.viz.theme import strategy_color, strategy_display_name
+
+
+def render() -> None:
+    """Draw the Overview page against the configured database."""
+    st.title("Overview")
+
+    db_path: Path = viz_settings.current_settings().db_path
+    if not viz_db.database_exists(db_path):
+        empty_state(
+            "Database not found.",
+            "Run an experiment first, or update the path in ⚙️ Settings.",
+        )
+        return
+
+    with viz_db.connect(db_path) as conn:
+        summary = viz_queries.overview_summary(conn)
+        runs_split = viz_queries.runs_by_strategy_success(conn)
+        cost_split = viz_queries.total_cost_by_strategy(conn)
+
+    if summary["total_runs"] == 0:
+        empty_state("No runs recorded yet.", "Run an experiment first.")
+        return
+
+    # --- Metric cards ---
+    c1, c2, c3, c4 = st.columns(4)
+    c1.metric("Total runs", f"{summary['total_runs']:,}")
+    c2.metric("Success rate", f"{summary['success_rate'] * 100:.0f}%")
+    c3.metric("Total cost", f"${summary['total_cost_usd']:,.2f}")
+    # Environment count is optional context — omit silently if none recorded.
+    if summary["distinct_environments"]:
+        c4.metric("Environments", f"{summary['distinct_environments']:,}")
+
+    st.divider()
+
+    # --- Runs per strategy, split by success ---
+    _render_runs_chart(runs_split)
+    # --- Total cost per strategy ---
+    _render_cost_chart(cost_split)
+
+
+def _render_runs_chart(runs_split: list[tuple[str, int, int]]) -> None:
+    """Stacked success / failure run counts per strategy; empty-state if no rows."""
+    if not runs_split:
+        empty_state("No run results to chart yet.")
+        return
+    names = [strategy_display_name(s) for s, _, _ in runs_split]
+    successes = [s for _, s, _ in runs_split]
+    failures = [f for _, _, f in runs_split]
+
+    fig, ax = new_figure()
+    # Stacked: successes then failures, in two fixed colors (not per-strategy).
+    ax.bar(names, successes, color="#1ABC9C", label="Success")
+    ax.bar(names, failures, bottom=successes, color="#E74C3C", label="Failure")
+    ax.set_ylabel("Runs")
+    ax.set_xlabel("Strategy")
+    ax.grid(axis="y")  # vertical bars → horizontal grid only
+    ax.legend()
+    fig.tight_layout()
+    render_chart(
+        fig,
+        filename="runs_by_strategy",
+        key="overview-runs",
+        caption="Run count per strategy, split by success / failure.",
+    )
+
+
+def _render_cost_chart(cost_split: list[tuple[str, float]]) -> None:
+    """Total cost bar per strategy, in the strategy's color; draws nothing if empty."""
+    if not cost_split:
+        return
+    names = [strategy_display_name(s) for s, _ in cost_split]
+    costs = [c for _, c in cost_split]
+    colors = [strategy_color(s) for s, _ in cost_split]
+
+    fig, ax = new_figure()
+    ax.bar(names, costs, color=colors)
+    ax.set_ylabel("Total cost (USD)")
+    ax.set_xlabel("Strategy")
+    ax.grid(axis="y")
+    fig.tight_layout()
+    render_chart(
+        fig,
+        filename="cost_by_strategy",
+        key="overview-cost",
+        caption="Total cost (USD) per strategy.",
+    )
diff --git a/src/maestro/viz/views/pareto.py b/src/maestro/viz/views/pareto.py
new file mode 100644
index 0000000..a8e9d0b
--- /dev/null
+++ b/src/maestro/viz/views/pareto.py
@@ -0,0 +1,176 @@
+"""
+MAESTRO viz — Pareto view (RQ4).
+
+Correctness vs.
efficiency scatter: entity-ID F1 against cost and against +latency, colored by strategy and shaped by tier. Since matplotlib figures +have no hover, an accompanying data table carries the per-point detail. +Controls excluded. +""" + +from __future__ import annotations + +from pathlib import Path + +import streamlit as st + +from maestro.viz import db as viz_db +from maestro.viz import queries as viz_queries +from maestro.viz import settings as viz_settings +from maestro.viz.chart import new_figure, render_chart +from maestro.viz.components import empty_state +from maestro.viz.theme import strategy_color, strategy_display_name + +# Tier → matplotlib marker shape. Defined here (not the theme) as it is a +# Pareto-specific encoding. +_TIER_MARKER = {1: "o", 2: "s", 3: "D"} +_DEFAULT_MARKER = "o" + + +def render() -> None: + """Draw the Pareto page.""" + st.title("Pareto — correctness vs. efficiency") + + db_path: Path = viz_settings.current_settings().db_path + if not viz_db.database_exists(db_path): + empty_state( + "Database not found.", + "Run an experiment first, or update the path in ⚙️ Settings.", + ) + return + + with viz_db.connect(db_path) as conn: + all_strategies = viz_queries.distinct_strategies(conn) + all_tiers = viz_queries.distinct_tiers(conn) + + fcol1, fcol2 = st.columns(2) + sel_strategies = fcol1.multiselect( + "Strategies", + options=all_strategies, + default=all_strategies, + format_func=strategy_display_name, + ) + sel_tiers = fcol2.multiselect( + "Tiers", + options=all_tiers, + default=all_tiers, + format_func=lambda t: f"Tier {t}", + ) + + points = viz_queries.pareto_points( + conn, + strategies=sel_strategies or None, + tiers=sel_tiers or None, + ) + + distinct_present = {p["strategy"] for p in points} + if len(distinct_present) < 2: + empty_state( + "Add results from at least two strategies to see the Pareto comparison.", + f"Currently {len(distinct_present)} strategy with data.", + ) + return + + _scatter( + points, + x_field="cost_usd", + 
xlabel="Cost (USD)", + filename="pareto_cost", + key="pareto-cost", + ) + _scatter( + points, + x_field="duration_ms", + xlabel="Latency (ms)", + filename="pareto_latency", + key="pareto-latency", + ) + + # No hover on static figures — a table carries the per-run detail. + st.caption("Per-run detail") + st.dataframe( + [ + { + "run": str(p["run_id"])[:8], + "strategy": strategy_display_name(p["strategy"]), + "model": p["model"], + "tier": p["tier"], + "cost_usd": round(p["cost_usd"], 6), + "latency_ms": p["duration_ms"], + "entity_id_f1": round(p["entity_id_f1"], 4), + } + for p in points + ], + use_container_width=True, + hide_index=True, + ) + + +def _scatter( + points: list[dict], + *, + x_field: str, + xlabel: str, + filename: str, + key: str, +) -> None: + """Scatter of entity_id_f1 (y) vs ``x_field`` (x); color=strategy, marker=tier.""" + fig, ax = new_figure(figsize=(8.0, 5.0)) + + # One scatter call per (strategy, tier) combo so color + marker are set + # consistently and the legend stays meaningful. + seen_strategies: dict[str, str] = {} + seen_tiers: set[int] = set() + for p in points: + ax.scatter( + p[x_field], + p["entity_id_f1"], + color=strategy_color(p["strategy"]), + marker=_TIER_MARKER.get(p["tier"], _DEFAULT_MARKER), + s=60, + edgecolors="white", + linewidths=0.5, + ) + seen_strategies[p["strategy"]] = strategy_color(p["strategy"]) + seen_tiers.add(p["tier"]) + + ax.set_xlabel(xlabel) + ax.set_ylabel("Entity-ID F1") + ax.set_ylim(0, 1) + ax.grid(axis="both") # scatter → both-axis grid + + # Two legends: color = strategy, marker = tier. 
+ from matplotlib.lines import Line2D + + color_handles = [ + Line2D( + [], + [], + marker="o", + linestyle="", + color=color, + label=strategy_display_name(s), + ) + for s, color in sorted(seen_strategies.items()) + ] + tier_handles = [ + Line2D( + [], + [], + marker=_TIER_MARKER.get(t, _DEFAULT_MARKER), + linestyle="", + color="#333333", + label=f"Tier {t}", + ) + for t in sorted(seen_tiers) + ] + first = ax.legend(handles=color_handles, title="Strategy", loc="lower right") + ax.add_artist(first) + ax.legend(handles=tier_handles, title="Tier", loc="upper left") + + fig.tight_layout() + render_chart( + fig, + filename=filename, + key=key, + caption=f"Entity-ID F1 vs. {xlabel.lower()} (color = strategy, marker = tier).", + ) diff --git a/src/maestro/viz/views/run_detail.py b/src/maestro/viz/views/run_detail.py new file mode 100644 index 0000000..63764d8 --- /dev/null +++ b/src/maestro/viz/views/run_detail.py @@ -0,0 +1,185 @@ +""" +MAESTRO viz — Run Detail view (diagnostic, no RQ mapping). + +Pick one run and inspect it: the input spec + ground truth (read from the +file system via experiment_config.INPUTS), the generated diagram with a +parse-validity badge, a per-run metric breakdown, and the sub-call trace for +multi-step strategies. +""" + +from __future__ import annotations + +from pathlib import Path + +import streamlit as st + +from maestro.experiment_config import INPUTS +from maestro.viz import db as viz_db +from maestro.viz import queries as viz_queries +from maestro.viz import settings as viz_settings +from maestro.viz.chart import new_figure, render_chart +from maestro.viz.components import empty_state +from maestro.viz.theme import strategy_display_name + +# example_id → InputFile, for resolving input + ground-truth file paths. The +# DB stores only example_id; the actual files live on disk per the config. +_INPUTS_BY_ID = {inp.example_id: inp for inp in INPUTS} + +# Per-run metric breakdown: the score columns shown as a horizontal bar. 
+_METRIC_COLUMNS = [
+    "entity_id_f1",
+    "entity_name_f1",
+    "entity_lemma_f1",
+    "relationship_relaxed_f1",
+    "relationship_strict_f1",
+]
+
+
+def render() -> None:
+    """
+    Draw the Run Detail page.
+
+    Lets the user pick one run, then shows its input/output, its metric
+    breakdown and — when sub-results exist — the sub-call trace. Shows an
+    empty state when the database, the run list or the selected run is missing.
+    """
+    st.title("Run Detail")
+
+    db_path: Path = viz_settings.current_settings().db_path
+    if not viz_db.database_exists(db_path):
+        empty_state(
+            "Database not found.",
+            "Run an experiment first, or update the path in ⚙️ Settings.",
+        )
+        return
+
+    with viz_db.connect(db_path) as conn:
+        runs = viz_queries.list_runs(conn)
+        if not runs:
+            empty_state("No runs available.")
+            return
+
+        # run_id → human-readable label; the selectbox stores the raw run_id.
+        labels = {
+            r["run_id"]: (
+                f"{strategy_display_name(r['strategy'])} | {r['model']} | "
+                f"tier {r['tier']} | {_fmt_ts(r['timestamp'])}"
+            )
+            for r in runs
+        }
+        run_id = st.selectbox(
+            "Run",
+            options=[r["run_id"] for r in runs],
+            format_func=lambda rid: labels[rid],
+        )
+
+        detail = viz_queries.run_detail(conn, run_id)
+        subs = viz_queries.sub_results_for_run(conn, run_id)
+
+    if detail is None:
+        empty_state("Selected run not found.")
+        return
+
+    _render_io(detail)
+    st.divider()
+    _render_metric_breakdown(detail)
+    if subs:
+        st.divider()
+        _render_sub_trace(subs)
+
+
+def _fmt_ts(ts: str) -> str:
+    """
+    Format a stored UTC timestamp for display in the configured tz.
+
+    Returns ``""`` for an empty value and the input unchanged when it cannot
+    be parsed as ISO 8601.
+    """
+    from datetime import datetime
+
+    from maestro.analysis.timestamps import format_for_display
+
+    if not ts:
+        return ""
+    cfg = viz_settings.current_settings()
+    try:
+        # datetime.fromisoformat() only accepts a trailing "Z" from Python
+        # 3.11 on; rewrite it as an explicit UTC offset so "...Z" timestamps
+        # parse on older interpreters too.
+        iso = ts[:-1] + "+00:00" if ts.endswith(("Z", "z")) else ts
+        dt = datetime.fromisoformat(iso)
+    except (AttributeError, ValueError, TypeError):
+        # Not a string, or not ISO 8601 — show the stored value as-is.
+        return ts
+    return format_for_display(dt, cfg.display_tz)
+
+
+def _render_io(detail: dict) -> None:
+    """Left: input spec + ground truth (from disk). Right: generated diagram."""
+    left, right = st.columns(2)
+
+    inp = _INPUTS_BY_ID.get(detail["example_id"])
+    with left:
+        st.subheader("Input & ground truth")
+        if inp is None:
+            st.info(
+                f"No input registered for example_id "
+                f"`{detail['example_id']}` in experiment_config.INPUTS."
+            )
+        else:
+            st.markdown("**Input spec**")
+            st.code(_read_or_note(inp.file_path), language="json")
+            st.markdown("**Ground truth**")
+            st.code(_read_or_note(inp.ground_truth_path), language="mermaid")
+
+    with right:
+        st.subheader("Generated diagram")
+        parses = detail.get("parses_valid")
+        # None means "never checked". Anything else is judged by truthiness
+        # rather than identity: SQLite has no boolean type and hands back 1/0,
+        # which `is True` / `is False` would never match.
+        # NOTE(review): assumes the query layer does not already convert the
+        # column to bool — harmless either way.
+        if parses is None:
+            st.warning("Parse validity: not checked", icon="❔")
+        elif parses:
+            st.success("Parses: valid", icon="✅")
+        else:
+            st.error("Parses: invalid", icon="❌")
+        output = detail.get("output_diagram_code")
+        if output:
+            st.code(output, language="mermaid")
+        else:
+            st.info("No diagram produced for this run.")
+        if detail.get("error"):
+            st.error(f"Run error: {detail['error']}")
+
+
+def _read_or_note(path: Path) -> str:
+    """Read a text file, or return a short note if it's missing/unreadable."""
+    try:
+        return Path(path).read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError):
+        return f"(file not readable: {path})"
+
+
+def _render_metric_breakdown(detail: dict) -> None:
+    """
+    Horizontal bar of the run's F1 scores. Neutral color — one run only.
+
+    Columns listed in ``_METRIC_COLUMNS`` whose value is None are skipped; if
+    none remain, an info message is shown instead of a chart.
+    """
+    st.subheader("Metric breakdown")
+    values = [(col, detail.get(col)) for col in _METRIC_COLUMNS]
+    values = [(c, v) for c, v in values if v is not None]
+    if not values:
+        st.info("No metric scores recorded for this run.")
+        return
+
+    # "entity_id_f1" → "entity id": drop the suffix, spaces for underscores.
+    labels = [c.replace("_f1", "").replace("_", " ") for c, _ in values]
+    scores = [float(v) for _, v in values]
+
+    fig, ax = new_figure(figsize=(7.0, 4.0))
+    ax.barh(labels, scores, color="#7F8C8D")
+    ax.set_xlim(0, 1)
+    ax.set_xlabel("F1")
+    ax.grid(axis="x")  # horizontal bars → vertical grid only
+    ax.invert_yaxis()  # first metric on top
+    fig.tight_layout()
+    render_chart(
+        fig,
+        filename=f"run_{str(detail['run_id'])[:8]}_metrics",
+        key="run-detail-metrics",
+        caption="F1 scores for the selected run.",
+    )
+
+
+def _render_sub_trace(subs: list[dict]) -> None:
+    """
+    Expandable trace of each sub-call for a multi-step strategy.
+
+    One expander per sub-call, holding token and cost metrics plus the
+    sub-call's output text and error when those are present.
+    """
+    st.subheader("Sub-call trace")
+    for s in subs:
+        header = f"Step {s['step_number']}: {s['step_name']}"
+        with st.expander(header):
+            cols = st.columns(3)
+            cols[0].metric("Prompt tokens", f"{s['prompt_tokens']:,}")
+            cols[1].metric("Completion tokens", f"{s['completion_tokens']:,}")
+            cols[2].metric("Cost", f"${s['cost_usd']:.6f}")
+            if s.get("output_text"):
+                st.markdown("**Output**")
+                st.code(s["output_text"])
+            if s.get("error"):
+                st.error(f"Sub-call error: {s['error']}")
diff --git a/src/maestro/viz/views/strategy_comparison.py b/src/maestro/viz/views/strategy_comparison.py
new file mode 100644
index 0000000..da3bef0
--- /dev/null
+++ b/src/maestro/viz/views/strategy_comparison.py
@@ -0,0 +1,132 @@
+"""
+MAESTRO viz — Strategy Comparison view (RQ1, RQ2).
+
+Grouped bars of entity- and relationship-level correctness per orchestration
+strategy, filterable by tier and model, with a precision / recall / F1 toggle.
+Controls are excluded (this is a comparison of strategies under test).
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+import streamlit as st
+
+from maestro.viz import db as viz_db
+from maestro.viz import queries as viz_queries
+from maestro.viz import settings as viz_settings
+from maestro.viz.chart import new_figure, render_chart
+from maestro.viz.components import empty_state
+from maestro.viz.theme import strategy_color, strategy_display_name
+
+# Metric suffix per toggle choice.
+_SUFFIX = {"F1": "f1", "Precision": "precision", "Recall": "recall"}
+
+
+def render() -> None:
+    """
+    Draw the Strategy Comparison page.
+
+    Renders the tier / model / measure filters, then one grouped-bar chart for
+    the entity metrics and one for the relationship metrics. Shows an empty
+    state when the database is missing or holds no metric results.
+    """
+    st.title("Strategy Comparison")
+
+    db_path: Path = viz_settings.current_settings().db_path
+    if not viz_db.database_exists(db_path):
+        empty_state(
+            "Database not found.",
+            "Run an experiment first, or update the path in ⚙️ Settings.",
+        )
+        return
+
+    with viz_db.connect(db_path) as conn:
+        if not viz_queries.has_any_metrics(conn):
+            empty_state("No metric results yet.")
+            return
+        tiers = viz_queries.distinct_tiers(conn)
+        models = viz_queries.distinct_models(conn)
+
+        # --- Filters (shared across both charts via widget state) ---
+        fcol1, fcol2, fcol3 = st.columns([1, 2, 1])
+        tier = fcol1.selectbox("Tier", options=tiers, format_func=lambda t: f"Tier {t}")
+        selected_models = fcol2.multiselect("Models", options=models, default=models)
+        measure = fcol3.radio("Measure", list(_SUFFIX), horizontal=True)
+        suffix = _SUFFIX[measure]
+
+        # Column names are "<metric>_<suffix>", e.g. "entity_id_f1".
+        entity_cols = [f"{m}_{suffix}" for m in viz_queries.ENTITY_METRICS]
+        rel_cols = [f"{m}_{suffix}" for m in viz_queries.RELATIONSHIP_METRICS]
+
+        # An empty model selection is passed as None.
+        entity_data = viz_queries.metric_means_by_strategy(
+            conn, entity_cols, tier=tier, models=selected_models or None
+        )
+        rel_data = viz_queries.metric_means_by_strategy(
+            conn, rel_cols, tier=tier, models=selected_models or None
+        )
+
+    _grouped_bar(
+        entity_data,
+        entity_cols,
+        labels=["ID", "Name", "Lemma"],
+        title=f"Entity {measure} per strategy",
+        ylabel=f"Entity {measure}",
+        filename=f"entity_{suffix}_by_strategy",
+        key="strat-entity",
+    )
+    _grouped_bar(
+        rel_data,
+        rel_cols,
+        labels=["Relaxed", "Strict"],
+        title=f"Relationship {measure} per strategy",
+        ylabel=f"Relationship {measure}",
+        filename=f"relationship_{suffix}_by_strategy",
+        key="strat-rel",
+    )
+
+
+def _grouped_bar(
+    data: dict[str, dict[str, float]],
+    columns: list[str],
+    *,
+    labels: list[str],
+    title: str,
+    ylabel: str,
+    filename: str,
+    key: str,
+) -> None:
+    """
+    Grouped bars: one group per strategy, one bar per metric column. Strategy
+    identity is the bar color; the metric variant is distinguished by position
+    + legend. Empty-state per chart when the filter yields nothing.
+
+    ``data`` maps strategy → {column: value}; ``labels`` are the legend
+    entries, paired with ``columns`` by position. ``title`` doubles as the
+    chart caption and the empty-state hint.
+    """
+    # No columns would make the bar width below divide by zero, so treat it
+    # the same as "no data".
+    if not data or not columns:
+        empty_state("No data for the current filter.", hint=title)
+        return
+
+    strategies = list(data)
+    n_groups = len(strategies)
+    n_bars = len(columns)
+    x = np.arange(n_groups)
+    width = 0.8 / n_bars  # each group occupies 0.8 of its tick spacing
+
+    fig, ax = new_figure(figsize=(9.0, 4.5))
+    for i, (col, label) in enumerate(zip(columns, labels)):
+        # Each metric variant offset within the strategy's group. Use the
+        # strategy color but vary alpha across variants so the strategy stays
+        # identifiable and the variant is distinguishable.
+        # `or 0.0` covers both a missing column and a stored None.
+        heights = [data[s].get(col) or 0.0 for s in strategies]
+        offset = (i - (n_bars - 1) / 2) * width
+        colors = [strategy_color(s) for s in strategies]
+        ax.bar(
+            x + offset,
+            heights,
+            width,
+            label=label,
+            color=colors,
+            # Floor the fade: without it alpha would drop below 0 from the
+            # sixth variant on, which matplotlib rejects. The first four
+            # variants keep their original alpha.
+            alpha=max(0.2, 1.0 - 0.22 * i),
+        )
+
+    ax.set_xticks(x)
+    ax.set_xticklabels([strategy_display_name(s) for s in strategies])
+    ax.set_ylabel(ylabel)
+    ax.set_ylim(0, 1)
+    ax.grid(axis="y")
+    ax.legend(title="Variant")
+    fig.tight_layout()
+    render_chart(fig, filename=filename, key=key, caption=title)
diff --git a/tests/viz/test_views.py b/tests/viz/test_views.py
new file mode 100644
index 0000000..d7b422f
--- /dev/null
+++ b/tests/viz/test_views.py
@@ -0,0 +1,357 @@
+"""
+Tests for the five data-view query functions, against a synthetic in-memory
+database with multiple strategies, models, tiers, and control rows.
+
+The view modules themselves are thin Streamlit wrappers (selectboxes, columns,
+st.pyplot) that need a script context to exercise; their *logic* lives in the
+queries here and in the design-system chart/theme code already covered by
+test_theme.py. These tests pin the query behavior every view depends on —
+especially control exclusion, filtering, and graceful degradation on missing
+tables — using the real schema and the production insert helpers.
+"""
+
+from __future__ import annotations
+
+import sqlite3
+import uuid
+
+import pytest
+
+pytest.importorskip("streamlit")
+pytest.importorskip("matplotlib")
+
+from maestro.db.client import SCHEMA  # noqa: E402
+from maestro.db.queries import insert_metric_result  # noqa: E402
+from maestro.schemas import MetricResult  # noqa: E402
+from maestro.viz import queries as q  # noqa: E402
+
+# View registry import check lives here too.
+from maestro.viz.views import VIEWS  # noqa: E402
+
+
+def _conn() -> sqlite3.Connection:
+    """Open an in-memory SQLite DB with Row access and the project SCHEMA applied."""
+    conn = sqlite3.connect(":memory:")
+    conn.row_factory = sqlite3.Row
+    conn.executescript(SCHEMA)
+    return conn
+
+
+def _insert_run(
+    conn: sqlite3.Connection,
+    *,
+    strategy: str,
+    model: str,
+    tier: int,
+    f1: float,
+    cost: float = 0.001,
+    duration_ms: int = 100,
+    success: bool = True,
+    missing_entities: int = 0,
+    extra_entities: int = 0,
+) -> str:
+    """Insert a full config+result+metric triple; return the run_id string."""
+    run_id = uuid.uuid4()
+    rid = str(run_id)
+    conn.execute(
+        "INSERT INTO run_configs "
+        "(run_id, strategy, model, example_id, tier, run_number, timestamp) "
+        "VALUES (?, ?, ?, 'ex_01', ?, 1, '2026-01-01T00:00:00Z')",
+        (rid, strategy, model, tier),
+    )
+    # A failed run is modelled as: no output diagram and error = "boom".
+    conn.execute(
+        "INSERT INTO run_results "
+        "(run_id, output_diagram_code, prompt_tokens, completion_tokens, "
+        " duration_ms, cost_usd, error, retry_count) "
+        "VALUES (?, ?, 10, 10, ?, ?, ?, 0)",
+        (
+            rid,
+            "graph TD; a-->b" if success else None,
+            duration_ms,
+            cost,
+            None if success else "boom",
+        ),
+    )
+    # Only the entity-ID scores and the two taxonomy counts vary per run;
+    # every other metric field is fixed at zero.
+    insert_metric_result(
+        conn,
+        MetricResult(
+            run_id=run_id,
+            parses_valid=True,
+            entity_id_precision=f1,
+            entity_id_recall=f1,
+            entity_id_f1=f1,
+            entity_name_precision=0.0,
+            entity_name_recall=0.0,
+            entity_name_f1=0.0,
+            entity_lemma_precision=0.0,
+            entity_lemma_recall=0.0,
+            entity_lemma_f1=0.0,
+            relationship_relaxed_precision=0.0,
+            relationship_relaxed_recall=0.0,
+            relationship_relaxed_f1=0.0,
+            relationship_strict_precision=0.0,
+            relationship_strict_recall=0.0,
+            relationship_strict_f1=0.0,
+            entities_in_output=0,
+            entities_in_truth=0,
+            relationships_in_output=0,
+            relationships_in_truth=0,
+            missing_entities=missing_entities,
+            extra_entities=extra_entities,
+            false_entities=0,
+            duplicate_entities=0,
+            missing_relationships=0,
+            extra_relationships=0,
+            false_relationships=0,
+            duplicate_relationships=0,
+        ),
+    )
+    return rid
+
+
+def _populate(conn: sqlite3.Connection) -> None:
+    """Two LLM strategies across two tiers, plus a control row."""
+    _insert_run(
+        conn, strategy="single_agent", model="gpt-4o-mini-2024-07-18", tier=1, f1=0.6
+    )
+    _insert_run(
+        conn, strategy="single_agent", model="gpt-4o-mini-2024-07-18", tier=2, f1=0.7
+    )
+    _insert_run(
+        conn,
+        strategy="crew_ai",
+        model="mistral-small-2603",
+        tier=1,
+        f1=0.8,
+        missing_entities=2,
+        extra_entities=1,
+    )
+    _insert_run(conn, strategy="crew_ai", model="mistral-small-2603", tier=2, f1=0.9)
+    _insert_run(conn, strategy="null_control", model="control", tier=1, f1=0.0)
+
+
+# ---------------------------------------------------------------------------
+# Registry
+# ---------------------------------------------------------------------------
+
+
+def test_views_registry_has_all_data_views():
+    """VIEWS holds (label, callable) pairs covering every expected page label."""
+    labels = {label for label, _ in VIEWS}
+    for expected in (
+        "Home",
+        "Overview",
+        "Strategy Comparison",
+        "Pareto",
+        "Run Detail",
+        "Diagram Visualizer",
+        "Hallucination Taxonomy",
+    ):
+        assert expected in labels
+    assert all(callable(fn) for _, fn in VIEWS)
+
+
+# ---------------------------------------------------------------------------
+# Overview
+# ---------------------------------------------------------------------------
+
+
+def test_overview_summary_counts():
+    """All five populated runs (control included) are counted and successful."""
+    conn = _conn()
+    _populate(conn)
+    s = q.overview_summary(conn)
+    assert s["total_runs"] == 5
+    assert s["successful_runs"] == 5
+    assert s["success_rate"] == 1.0
+    assert s["total_cost_usd"] > 0
+
+
+def test_overview_summary_empty_db():
+    """A schema-only DB reports zero runs and a 0.0 success rate."""
+    s = q.overview_summary(_conn())
+    assert s["total_runs"] == 0
+    assert s["success_rate"] == 0.0
+
+
+def test_runs_by_strategy_includes_controls():
+    """Overview is operational — controls ARE included here."""
+    conn = _conn()
+    _populate(conn)
+    rows = dict((r[0], (r[1], r[2])) for r in q.runs_by_strategy_success(conn))
+    assert "null_control" in rows
+
+
+# ---------------------------------------------------------------------------
+# Strategy Comparison
+# ---------------------------------------------------------------------------
+
+
+def test_metric_means_excludes_controls():
+    """Only the two non-control strategies appear in the metric means."""
+    conn = _conn()
+    _populate(conn)
+    data = q.metric_means_by_strategy(conn, ["entity_id_f1"])
+    assert "null_control" not in data
+    assert set(data) == {"single_agent", "crew_ai"}
+
+
+def test_metric_means_tier_filter():
+    """With tier=1, each strategy's mean equals its single tier-1 run's score."""
+    conn = _conn()
+    _populate(conn)
+    tier1 = q.metric_means_by_strategy(conn, ["entity_id_f1"], tier=1)
+    # single_agent tier-1 f1 = 0.6, crew_ai tier-1 f1 = 0.8
+    assert tier1["single_agent"]["entity_id_f1"] == pytest.approx(0.6)
+    assert tier1["crew_ai"]["entity_id_f1"] == pytest.approx(0.8)
+
+
+def test_metric_means_rejects_bad_column():
+    """A column name carrying an SQL payload raises ValueError."""
+    conn = _conn()
+    _populate(conn)
+    with pytest.raises(ValueError):
+        q.metric_means_by_strategy(conn, ["entity_id_f1; DROP TABLE x"])
+
+
+def test_distinct_tiers_and_models():
+    """Tiers come back as [1, 2]; the control's model is not listed."""
+    conn = _conn()
+    _populate(conn)
+    assert q.distinct_tiers(conn) == [1, 2]
+    models = q.distinct_models(conn)
+    assert "control" not in models
+    assert "gpt-4o-mini-2024-07-18" in models
+
+
+# ---------------------------------------------------------------------------
+# Pareto
+# ---------------------------------------------------------------------------
+
+
+def test_pareto_points_exclude_controls():
+    """Pareto points omit the control strategy and carry the plotted fields."""
+    conn = _conn()
+    _populate(conn)
+    pts = q.pareto_points(conn)
+    strategies = {p["strategy"] for p in pts}
+    assert "null_control" not in strategies
+    # Each point carries the plotted fields.
+    p = pts[0]
+    assert {"cost_usd", "duration_ms", "entity_id_f1", "tier", "model"} <= set(p)
+
+
+def test_pareto_strategy_filter():
+    """Filtering by one strategy returns points for that strategy only."""
+    conn = _conn()
+    _populate(conn)
+    pts = q.pareto_points(conn, strategies=["crew_ai"])
+    assert {p["strategy"] for p in pts} == {"crew_ai"}
+
+
+def test_distinct_strategies_excludes_controls_by_default():
+    """Controls are hidden by default and listed with exclude_controls=False."""
+    conn = _conn()
+    _populate(conn)
+    assert "null_control" not in q.distinct_strategies(conn)
+    assert "null_control" in q.distinct_strategies(conn, exclude_controls=False)
+
+
+# ---------------------------------------------------------------------------
+# Run Detail
+# ---------------------------------------------------------------------------
+
+
+def test_list_and_detail_roundtrip():
+    """An inserted run shows up in list_runs and run_detail returns its fields."""
+    conn = _conn()
+    rid = _insert_run(
+        conn, strategy="single_agent", model="gpt-4o-mini-2024-07-18", tier=1, f1=0.6
+    )
+    runs = q.list_runs(conn)
+    assert any(r["run_id"] == rid for r in runs)
+    detail = q.run_detail(conn, rid)
+    assert detail is not None
+    assert detail["strategy"] == "single_agent"
+    assert detail["entity_id_f1"] == pytest.approx(0.6)
+
+
+def test_run_detail_unknown_id():
+    """run_detail returns None for a run_id that was never inserted."""
+    conn = _conn()
+    _populate(conn)
+    assert q.run_detail(conn, str(uuid.uuid4())) is None
+
+
+def test_sub_results_empty_when_none():
+    """A run inserted without sub-results yields an empty list."""
+    conn = _conn()
+    rid = _insert_run(
+        conn, strategy="sop_based", model="mistral-small-2603", tier=1, f1=0.5
+    )
+    assert q.sub_results_for_run(conn, rid) == []
+
+
+# ---------------------------------------------------------------------------
+# Hallucination Taxonomy
+# ---------------------------------------------------------------------------
+
+
+def test_has_any_taxonomy_data():
+    """False while every taxonomy count is zero; True once one is non-zero."""
+    conn = _conn()
+    # All-zero taxonomy → False.
+    _insert_run(
+        conn, strategy="single_agent", model="gpt-4o-mini-2024-07-18", tier=1, f1=0.6
+    )
+    assert q.has_any_taxonomy_data(conn) is False
+    # A row with a non-zero count → True.
+    _insert_run(
+        conn,
+        strategy="crew_ai",
+        model="mistral-small-2603",
+        tier=1,
+        f1=0.8,
+        missing_entities=3,
+    )
+    assert q.has_any_taxonomy_data(conn) is True
+
+
+def test_taxonomy_counts_by_strategy():
+    """Counts for crew_ai match its tier-1 row (its tier-2 row is all zeros)."""
+    conn = _conn()
+    _populate(conn)
+    data = q.taxonomy_counts_by_strategy(conn, q.ENTITY_TAXONOMY)
+    # crew_ai tier-1 row had missing_entities=2, extra_entities=1.
+    assert data["crew_ai"]["missing_entities"] == 2
+    assert data["crew_ai"]["extra_entities"] == 1
+
+
+def test_taxonomy_rejects_bad_column():
+    """A taxonomy column name carrying an SQL payload raises ValueError."""
+    conn = _conn()
+    _populate(conn)
+    with pytest.raises(ValueError):
+        q.taxonomy_counts_by_strategy(conn, ("missing_entities; DROP TABLE x",))
+
+
+# ---------------------------------------------------------------------------
+# Mermaid rendering (Diagram Visualizer)
+# ---------------------------------------------------------------------------
+
+
+def test_mermaid_render_blank_source_returns_none():
+    """Empty/blank source short-circuits to None (nothing to render)."""
+    from maestro.viz.mermaid_render import render_mermaid_svg
+
+    assert render_mermaid_svg("") is None
+    assert render_mermaid_svg(" \n ") is None
+
+
+def test_mermaid_render_handles_missing_mmdc(monkeypatch):
+    """With mmdc absent, the renderer returns None (caller shows code)."""
+    import maestro.viz.mermaid_render as mr
+
+    # Make every executable lookup in the module report "not on PATH".
+    monkeypatch.setattr(mr.shutil, "which", lambda _: None)
+    assert mr.mmdc_available() is False
+    assert mr.render_mermaid_svg("graph TD; a-->b") is None
+
+
+# ---------------------------------------------------------------------------
+# Graceful degradation — every query no-ops on an empty (schemaless) DB
+# ---------------------------------------------------------------------------
+
+
+def test_queries_safe_on_schemaless_db():
+    """Each query returns its empty value on a DB with no tables at all."""
+    bare = sqlite3.connect(":memory:")
+    bare.row_factory = sqlite3.Row
+    assert q.overview_summary(bare)["total_runs"] == 0
+    assert q.runs_by_strategy_success(bare) == []
+    assert q.metric_means_by_strategy(bare, ["entity_id_f1"]) == {}
+    assert q.pareto_points(bare) == []
+    assert q.list_runs(bare) == []
+    assert q.has_any_taxonomy_data(bare) is False
+    assert q.taxonomy_counts_by_strategy(bare, q.ENTITY_TAXONOMY) == {}