Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions data/05_bpmn_1.JSON
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
},
{
"id": "gw_result",
"name": "",
"name": "Result",
"type": "exclusiveGateway",
"lane": null,
"attached_to": null
Expand Down Expand Up @@ -61,7 +61,7 @@
},
{
"id": "gw_manager_decision",
"name": "",
"name": "Manager Decision",
"type": "exclusiveGateway",
"lane": null,
"attached_to": null
Expand Down
4 changes: 2 additions & 2 deletions data/25_bpmn_3.JSON
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,15 @@
},
{
"id": "xgw_approval_result",
"name": "",
"name": "Vacation Approval",
"type": "exclusiveGateway",
"pool": "pool_vacation",
"lane": null,
"attached_to": null
},
{
"id": "xgw_manual_result",
"name": "",
"name": "Vacation Approved",
"type": "exclusiveGateway",
"pool": "pool_vacation",
"lane": null,
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ select = ["E", "F", "I", "W"]
# human description of the diagram. These are intentionally long prose strings
# where wrapping would hurt readability, so line length is not enforced here.
"src/maestro/experiment_config.py" = ["E501"]
# prompts.py holds the canonical Mermaid output contract: each rule is a single
# prose line shown to the model verbatim. Wrapping a rule would split it across
# bullets in the rendered prompt, so line length is not enforced here.
"src/maestro/prompts.py" = ["E501"]

[tool.pytest.ini_options]
testpaths = ["tests"]
Expand Down
22 changes: 22 additions & 0 deletions src/maestro/analysis/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,21 @@ def _lemmatize_label(label: str) -> str:
r"\s*[\]\)\}]+" # closing bracket(s)
)


def _strip_inline_labels(line: str) -> str:
    """
    Reduce every inline node definition (``id["Label"]``) in *line* to its ``id``.

    An edge line is allowed to restate a node's label on either endpoint,
    for instance ``a["A"] --> b["B"]``. The edge regexes need the id to be
    directly adjacent to the operator, so a labelled *source* endpoint would
    otherwise hide the edge from extraction. After this collapse the line
    reads ``a --> b``, which the operator scan can match. No label
    information is lost: ``extract_nodes`` collects node labels separately.
    """

    def _bare_id(match):
        # Keep only the first capture group of _NODE_DEF (the node id) and
        # drop the bracketed label that followed it.
        return match.group(1)

    return _NODE_DEF.sub(_bare_id, line)


# Edge label between pipes, e.g. -->|"Green (no risk)"| — stripped before node
# scanning so its text is never mistaken for a node definition.
_PIPE_LABEL = re.compile(r"\|[^|]*\|")
Expand Down Expand Up @@ -240,6 +255,10 @@ def _add(src: str, tgt: str, rel_type: str, undirected: bool = False) -> None:
line = raw.strip()
if not line or line.startswith("%%"):
continue
# Collapse any inline node labels (``a["A"] --> b["B"]``) to bare ids
# so a labelled source endpoint can't hide the edge from the operator
# scan. Node labels themselves are captured by ``extract_nodes``.
line = _strip_inline_labels(line)
for m in _EDGE.finditer(line):
src, op, tgt = m.group(1), m.group(2), m.group(3)
if op in ("o--o", "--o", "--x"):
Expand All @@ -266,6 +285,9 @@ def extract_attachments(mermaid_code: str) -> list[dict]:
line = raw.strip()
if not line or line.startswith("%%"):
continue
# Same inline-label collapse as extract_relationships: an attachment
# written ``task["T"] o--o evt(("E"))`` must still match _ATTACH.
line = _strip_inline_labels(line)
for m in _ATTACH.finditer(line):
a, b = sorted((m.group(1), m.group(2)))
if (a, b) not in seen:
Expand Down
58 changes: 58 additions & 0 deletions src/maestro/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""
MAESTRO — Canonical Mermaid output contract (single source of truth).

The rules that tell a model how to emit Mermaid live HERE and nowhere else.
Providers supply the system-message identity from ``MERMAID_SYSTEM_IDENTITY``;
the single-agent baseline and multi-step step 3 build their user prompts from
``render_rules()``. Defining the contract once keeps every provider and every
orchestration strategy on a byte-identical output contract — so quality
differences are attributable to orchestration (the independent variable), not
to drifting prompt wording.

Why this module imports nothing from ``maestro``: ``providers`` and
``strategies`` both depend on it, so any back-import would create a cycle.
Keep it dependency-free (plain strings + one helper).

The optional ``skill`` layer in ``render_rules`` is the future
prompt-enhancement variable: an append-only block, never an edit to the
baseline rules, so the enhancement stays an isolatable condition.
"""

from __future__ import annotations

# System-message identity. Was duplicated verbatim as ``SYSTEM_PROMPT`` in
# every provider subclass; now defined once and assigned on ``LLMProvider``.
MERMAID_SYSTEM_IDENTITY = (
"You are a diagram generation assistant. "
"Respond only with valid Mermaid diagram code. "
"Do not include any explanation, markdown fencing, or additional text."
)

# Unified user-prompt output rules. Was hand-copied (with drift) in
# ``single.py`` and step 3 of ``_extraction.py``; now one contract applied
# identically to both. Brace-free on purpose so it can be embedded into a
# template string ahead of any later ``.format()`` call without escaping.
MERMAID_RULES = """\
- Begin the diagram with a flowchart header, `flowchart LR`; do not use C4, sequence, class, or other diagram types
- Output only valid Mermaid syntax
- Wrap node labels in double quotes, e.g. node_id["My Label"], so labels with spaces, parentheses, slashes, or line breaks stay parseable
- If a node has no label, write just its id (e.g. gw_result) — never an empty bracket like node_id[""]
- Quote edge labels the same way, with no spaces inside the pipes, e.g. a -->|"My edge"| b; for an unlabelled edge use a plain arrow a --> b and never an empty label like -->|| or -->| |
- Include every entity and relationship from the input
- Preserve hierarchy using subgraphs for pools, lanes, and subprocesses
- Do not invent entities or relationships not present in the input
- Do not include explanations or markdown code fences
- Do not use internal or relationship IDs as edge labels"""


def render_rules(skill: str | None = None) -> str:
    """
    Build the canonical Mermaid rules text, with an optional skills block.

    The skills block is strictly APPEND-ONLY. It is placed after the baseline
    rules, separated by a single newline, and never modifies them, so a
    baseline run (``skill=None``) and an enhanced run differ only by the
    appended text. Baseline callers pass ``None`` and get ``MERMAID_RULES``
    back unchanged.
    """
    sections = [MERMAID_RULES]
    if skill is not None:
        # Any non-None skill (even an empty string) is appended after a newline.
        sections.append(skill)
    return "\n".join(sections)
7 changes: 1 addition & 6 deletions src/maestro/providers/anthropic.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,7 @@ class AnthropicProvider(LLMProvider):
Uses the official anthropic SDK — add 'anthropic>=0.25.0' to pyproject.toml.
"""

# Instructs the model to output diagram code only — no prose or fencing
SYSTEM_PROMPT = (
"You are a diagram generation assistant. "
"Respond only with valid Mermaid diagram code. "
"Do not include any explanation, markdown fencing, or additional text."
)
# SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts).

# Max tokens for the completion — diagram code is rarely long
MAX_TOKENS = 4096
Expand Down
7 changes: 7 additions & 0 deletions src/maestro/providers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from abc import ABC, abstractmethod

from maestro.prompts import MERMAID_SYSTEM_IDENTITY
from maestro.schemas import ModelPricing, RunConfig, RunResult


Expand All @@ -17,6 +18,12 @@ class LLMProvider(ABC):
# Centralized temperature setting — 0 for reproducibility across all providers
TEMPERATURE = 0

# Default system identity for Mermaid generation, shared by every provider.
# Subclasses inherit this; a strategy that needs a different identity for a
# given call (e.g. SOP steps 1-2 requesting JSON) passes system_prompt
# explicitly to complete().
SYSTEM_PROMPT = MERMAID_SYSTEM_IDENTITY

def __init__(self, api_key: str, pricing: ModelPricing) -> None:
# api_key stored on instance — never logged or serialised
self.api_key = api_key
Expand Down
6 changes: 1 addition & 5 deletions src/maestro/providers/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,7 @@ class GeminiProvider(LLMProvider):
Uses the official google-genai SDK — add 'google-genai>=1.0' to pyproject.toml.
"""

SYSTEM_PROMPT = (
"You are a diagram generation assistant. "
"Respond only with valid Mermaid diagram code. "
"Do not include any explanation, markdown fencing, or additional text."
)
# SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts).

MAX_TOKENS = 4096

Expand Down
6 changes: 1 addition & 5 deletions src/maestro/providers/mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,7 @@ class MistralProvider(LLMProvider):
``mistralai.client.errors`` respectively.
"""

SYSTEM_PROMPT = (
"You are a diagram generation assistant. "
"Respond only with valid Mermaid diagram code. "
"Do not include any explanation, markdown fencing, or additional text."
)
# SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts).

MAX_TOKENS = 4096

Expand Down
7 changes: 1 addition & 6 deletions src/maestro/providers/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,7 @@ class OpenAIProvider(LLMProvider):
# logged as "openai", misattributing failures in multi-provider runs.
_PROVIDER_NAME = "openai"

# Same role as AnthropicProvider.SYSTEM_PROMPT
SYSTEM_PROMPT = (
"You are a diagram generation assistant. "
"Respond only with valid Mermaid diagram code. "
"Do not include any explanation, markdown fencing, or additional text."
)
# SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts).

# Max tokens for the completion
MAX_TOKENS = 4096
Expand Down
100 changes: 73 additions & 27 deletions src/maestro/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,10 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--strategy",
type=str,
choices=[s.value for s in Strategy],
help="Run only this strategy (default: all enabled)",
help=(
"Run only these strategies (default: all enabled). "
"Comma-separated for several, e.g. --strategy single_agent,lang_graph"
),
)
parser.add_argument(
"--tier",
Expand All @@ -243,12 +245,19 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--model",
type=str,
help="Run only this model (default: all registered)",
help=(
"Run only these models (default: all registered). "
"Comma-separated for several, e.g. "
"--model gpt-4o-mini-2024-07-18,deepseek-v4-flash"
),
)
parser.add_argument(
"--example",
type=str,
help="Run only this example_id (default: all registered)",
help=(
"Run only these example_ids (default: all registered). "
"Comma-separated for several, e.g. --example bpmn_1_03,it_1_07"
),
)
parser.add_argument(
"--repeats",
Expand Down Expand Up @@ -294,30 +303,67 @@ def parse_args() -> argparse.Namespace:
# ---------------------------------------------------------------------------


def _split_csv(value: str | None) -> list[str] | None:
"""
Parse a comma-separated filter value into a clean list, or None if the
flag was absent. Empty/whitespace-only entries are dropped so trailing
commas and stray spaces don't create phantom filter values.
"""
if value is None:
return None
return [part.strip() for part in value.split(",") if part.strip()]


def build_matrix(args: argparse.Namespace) -> list[dict]:
"""
Build the experiment matrix as a list of dicts, each representing one run.
Applies CLI filters to narrow the cross-product.
Applies CLI filters to narrow the cross-product. The --strategy, --model
and --example flags accept a comma-separated list (membership filter).
"""
examples = _split_csv(args.example)
model_names = _split_csv(args.model)
strategy_names = _split_csv(args.strategy)

# Validate filter values up front (argparse no longer does, now that the
# flags accept lists). Catches a typo before any matrix work or API spend —
# a misspelled value in a list would otherwise silently shrink the matrix.
def _reject_unknown(flag: str, given: list[str], valid: set[str]) -> None:
unknown = [v for v in given if v not in valid]
if unknown:
print(
f"ERROR: unknown {flag} value(s): {', '.join(unknown)}. "
f"Known: {', '.join(sorted(valid))}",
file=sys.stderr,
)
sys.exit(2)

if strategy_names:
_reject_unknown("--strategy", strategy_names, {s.value for s in Strategy})
if examples:
_reject_unknown("--example", examples, {i.example_id for i in INPUTS})
# --model is validated below, after the strategy filter is known: an unknown
# model only matters when a real (LLM) strategy is actually selected, so the
# control-only no-op (--strategy null_control --model anything) is preserved.

# Filter inputs
inputs = INPUTS
if args.tier:
inputs = [i for i in inputs if i.tier.value == args.tier]
if args.example:
inputs = [i for i in inputs if i.example_id == args.example]
if examples:
inputs = [i for i in inputs if i.example_id in examples]

# Filter strategies
strategies = STRATEGIES
if args.strategy:
strategies = [s for s in strategies if s.value == args.strategy]
if strategy_names:
strategies = [s for s in strategies if s.value in strategy_names]

# Filter models — applies only to real (LLM) strategies. Control rows
# ignore --model because they don't use a model; --model gpt-4o-mini
# should narrow which LLM rows run but should not silently drop the
# sanity floor/ceiling rows the experiment needs.
models = MODELS
if args.model:
models = [m for m in models if m.model == args.model]
if model_names:
models = [m for m in models if m.model in model_names]

# Partition by strategy kind. Controls are deterministic in (model,
# repeat) — collapsing both dimensions to a single row per
Expand All @@ -326,22 +372,22 @@ def build_matrix(args: argparse.Namespace) -> list[dict]:
real_strategies = [s for s in strategies if s not in CONTROL_STRATEGIES]
control_strategies = [s for s in strategies if s in CONTROL_STRATEGIES]

# Fail fast on an unknown --model only when it actually matters:
# `--strategy null_control --model typo` should be a no-op on --model
# (controls don't use any model) rather than aborting. Without this
# guard, however, `--model gpt-4o-min` would silently produce just
# the 3 control rows when the user wanted a single LLM cell — looks
# like a tiny matrix instead of the misuse it is. So: validate the
# model flag only when there's at least one real strategy left after
# the strategy filter.
if args.model and real_strategies and not models:
known = ", ".join(m.model for m in MODELS)
print(
f"ERROR: --model {args.model!r} matches no registered model. "
f"Known: {known}",
file=sys.stderr,
)
sys.exit(2)
# Fail fast on any unknown --model value, but only when a real (LLM)
# strategy is selected. `--strategy null_control --model typo` stays a
# no-op on --model (controls don't use any model), so it must not abort.
# When a real strategy IS selected, a misspelled model would otherwise
# silently shrink the matrix (e.g. `--model gpt-4o-mini-2024-07-18,typo`
# would quietly run only the valid one) — so reject the typo loudly.
if model_names and real_strategies:
registered = {m.model for m in MODELS}
unknown = [m for m in model_names if m not in registered]
if unknown:
print(
f"ERROR: unknown --model value(s): {', '.join(unknown)}. "
f"Known: {', '.join(sorted(registered))}",
file=sys.stderr,
)
sys.exit(2)

matrix = []

Expand Down
Loading