Colinho22 · Colinho22 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/data/05_bpmn_1.JSON b/data/05_bpmn_1.JSON
@@ -33,7 +33,7 @@
     },
     {
       "id": "gw_result",
-      "name": "",
+      "name": "Result",
       "type": "exclusiveGateway",
       "lane": null,
       "attached_to": null
@@ -61,7 +61,7 @@
     },
     {
       "id": "gw_manager_decision",
-      "name": "",
+      "name": "Manager Decision",
       "type": "exclusiveGateway",
       "lane": null,
       "attached_to": null

diff --git a/data/25_bpmn_3.JSON b/data/25_bpmn_3.JSON
@@ -180,15 +180,15 @@
     },
     {
       "id": "xgw_approval_result",
-      "name": "",
+      "name": "Vacation Approval",
       "type": "exclusiveGateway",
       "pool": "pool_vacation",
       "lane": null,
       "attached_to": null
     },
     {
       "id": "xgw_manual_result",
-      "name": "",
+      "name": "Vacation Approved",
       "type": "exclusiveGateway",
       "pool": "pool_vacation",
       "lane": null,

diff --git a/pyproject.toml b/pyproject.toml
@@ -101,6 +101,10 @@ select = ["E", "F", "I", "W"]
 # human description of the diagram. These are intentionally long prose strings
 # where wrapping would hurt readability, so line length is not enforced here.
 "src/maestro/experiment_config.py" = ["E501"]
+# prompts.py holds the canonical Mermaid output contract: each rule is a single
+# prose line shown to the model verbatim. Wrapping a rule would split it across
+# bullets in the rendered prompt, so line length is not enforced here.
+"src/maestro/prompts.py" = ["E501"]
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]

diff --git a/src/maestro/analysis/metrics.py b/src/maestro/analysis/metrics.py
@@ -133,6 +133,21 @@ def _lemmatize_label(label: str) -> str:
     r"\s*[\]\)\}]+"  # closing bracket(s)
 )
 
+
+def _strip_inline_labels(line: str) -> str:
+    """
+    Return ``line`` with every inline node definition collapsed to its bare id.
+
+    Edge lines may redeclare a node's label on one or both endpoints, e.g.
+    ``a["A"] --> b["B"]``. The edge regexes expect the id to sit directly
+    against the operator, so a labelled *source* would otherwise break edge
+    extraction. Each ``_NODE_DEF`` match is replaced by its first capture group
+    (the id), leaving a clean ``a --> b`` for the operator scan; node labels
+    are captured separately by ``extract_nodes`` so nothing is lost here.
+    """
+    return _NODE_DEF.sub(lambda m: m.group(1), line)
+
+
 # Edge label between pipes, e.g. -->|"Green (no risk)"| — stripped before node
 # scanning so its text is never mistaken for a node definition.
 _PIPE_LABEL = re.compile(r"\|[^|]*\|")
@@ -240,6 +255,10 @@ def _add(src: str, tgt: str, rel_type: str, undirected: bool = False) -> None:
         line = raw.strip()
         if not line or line.startswith("%%"):
             continue
+        # Collapse any inline node labels (``a["A"] --> b["B"]``) to bare ids
+        # so a labelled source endpoint can't hide the edge from the operator
+        # scan. Node labels themselves are captured by ``extract_nodes``.
+        line = _strip_inline_labels(line)
         for m in _EDGE.finditer(line):
             src, op, tgt = m.group(1), m.group(2), m.group(3)
             if op in ("o--o", "--o", "--x"):
@@ -266,6 +285,9 @@ def extract_attachments(mermaid_code: str) -> list[dict]:
         line = raw.strip()
         if not line or line.startswith("%%"):
             continue
+        # Same inline-label collapse as extract_relationships: an attachment
+        # written ``task["T"] o--o evt(("E"))`` must still match _ATTACH.
+        line = _strip_inline_labels(line)
         for m in _ATTACH.finditer(line):
             a, b = sorted((m.group(1), m.group(2)))
             if (a, b) not in seen:

diff --git a/src/maestro/prompts.py b/src/maestro/prompts.py
@@ -0,0 +1,58 @@
+"""
+MAESTRO — Canonical Mermaid output contract (single source of truth).
+
+The rules that tell a model how to emit Mermaid live HERE and nowhere else.
+Providers supply the system-message identity from ``MERMAID_SYSTEM_IDENTITY``;
+the single-agent baseline and multi-step step 3 build their user prompts from
+``render_rules()``. Defining the contract once keeps every provider and every
+orchestration strategy on a byte-identical output contract — so quality
+differences are attributable to orchestration (the independent variable), not
+to drifting prompt wording.
+
+Why this module imports nothing from ``maestro``: ``providers`` and
+``strategies`` both depend on it, so any back-import would create a cycle.
+Keep it dependency-free (plain strings + one helper).
+
+The optional ``skill`` layer in ``render_rules`` is the future
+prompt-enhancement variable: an append-only block, never an edit to the
+baseline rules, so the enhancement stays an isolatable condition.
+"""
+
+from __future__ import annotations
+
+# System-message identity. Was duplicated verbatim as ``SYSTEM_PROMPT`` in
+# every provider subclass; now defined once and assigned on ``LLMProvider``.
+MERMAID_SYSTEM_IDENTITY = (
+    "You are a diagram generation assistant. "
+    "Respond only with valid Mermaid diagram code. "
+    "Do not include any explanation, markdown fencing, or additional text."
+)
+
+# Unified user-prompt output rules. Was hand-copied (with drift) in
+# ``single.py`` and step 3 of ``_extraction.py``; now one contract applied
+# identically to both. Brace-free on purpose so it can be embedded into a
+# template string ahead of any later ``.format()`` call without escaping.
+MERMAID_RULES = """\
+- Begin the diagram with a flowchart header, `flowchart LR`; do not use C4, sequence, class, or other diagram types
+- Output only valid Mermaid syntax
+- Wrap node labels in double quotes, e.g. node_id["My Label"], so labels with spaces, parentheses, slashes, or line breaks stay parseable
+- If a node has no label, write just its id (e.g. gw_result) — never an empty bracket like node_id[""]
+- Quote edge labels the same way, with no spaces inside the pipes, e.g. a -->|"My edge"| b; for an unlabelled edge use a plain arrow a --> b and never an empty label like -->|| or -->| |
+- Include every entity and relationship from the input
+- Preserve hierarchy using subgraphs for pools, lanes, and subprocesses
+- Do not invent entities or relationships not present in the input
+- Do not include explanations or markdown code fences
+- Do not use internal or relationship IDs as edge labels"""
+
+
+def render_rules(skill: str | None = None) -> str:
+    """
+    Return the canonical Mermaid rules, optionally extended by a skills block.
+
+    APPEND-ONLY: ``skill`` is joined after the baseline rules with one newline
+    and never edits them, so baseline and enhanced runs differ only by the added
+    text. Baseline callers pass ``None`` (an empty string still adds the newline).
+    """
+    if skill is None:
+        return MERMAID_RULES
+    return MERMAID_RULES + "\n" + skill
diff --git a/src/maestro/providers/anthropic.py b/src/maestro/providers/anthropic.py
@@ -28,12 +28,7 @@ class AnthropicProvider(LLMProvider):
     Uses the official anthropic SDK — add 'anthropic>=0.25.0' to pyproject.toml.
     """
 
-    # Instructs the model to output diagram code only — no prose or fencing
-    SYSTEM_PROMPT = (
-        "You are a diagram generation assistant. "
-        "Respond only with valid Mermaid diagram code. "
-        "Do not include any explanation, markdown fencing, or additional text."
-    )
+    # SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts).
 
     # Max tokens for the completion — diagram code is rarely long
     MAX_TOKENS = 4096

diff --git a/src/maestro/providers/base.py b/src/maestro/providers/base.py
@@ -5,6 +5,7 @@
 
 from abc import ABC, abstractmethod
 
+from maestro.prompts import MERMAID_SYSTEM_IDENTITY
 from maestro.schemas import ModelPricing, RunConfig, RunResult
 
 
@@ -17,6 +18,12 @@ class LLMProvider(ABC):
     # Centralized temperature setting — 0 for reproducibility across all providers
     TEMPERATURE = 0
 
+    # Default system identity for Mermaid generation, shared by every provider.
+    # Subclasses inherit this; a strategy that needs a different identity for a
+    # given call (e.g. SOP steps 1-2 requesting JSON) passes system_prompt
+    # explicitly to complete().
+    SYSTEM_PROMPT = MERMAID_SYSTEM_IDENTITY
+
     def __init__(self, api_key: str, pricing: ModelPricing) -> None:
         # api_key stored on instance — never logged or serialised
         self.api_key = api_key

diff --git a/src/maestro/providers/gemini.py b/src/maestro/providers/gemini.py
@@ -24,11 +24,7 @@ class GeminiProvider(LLMProvider):
     Uses the official google-genai SDK — add 'google-genai>=1.0' to pyproject.toml.
     """
 
-    SYSTEM_PROMPT = (
-        "You are a diagram generation assistant. "
-        "Respond only with valid Mermaid diagram code. "
-        "Do not include any explanation, markdown fencing, or additional text."
-    )
+    # SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts).
 
     MAX_TOKENS = 4096
 

diff --git a/src/maestro/providers/mistral.py b/src/maestro/providers/mistral.py
@@ -25,11 +25,7 @@ class MistralProvider(LLMProvider):
     ``mistralai.client.errors`` respectively.
     """
 
-    SYSTEM_PROMPT = (
-        "You are a diagram generation assistant. "
-        "Respond only with valid Mermaid diagram code. "
-        "Do not include any explanation, markdown fencing, or additional text."
-    )
+    # SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts).
 
     MAX_TOKENS = 4096
 

diff --git a/src/maestro/providers/openai.py b/src/maestro/providers/openai.py
@@ -36,12 +36,7 @@ class OpenAIProvider(LLMProvider):
     # logged as "openai", misattributing failures in multi-provider runs.
     _PROVIDER_NAME = "openai"
 
-    # Same role as AnthropicProvider.SYSTEM_PROMPT
-    SYSTEM_PROMPT = (
-        "You are a diagram generation assistant. "
-        "Respond only with valid Mermaid diagram code. "
-        "Do not include any explanation, markdown fencing, or additional text."
-    )
+    # SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts).
 
     # Max tokens for the completion
     MAX_TOKENS = 4096

diff --git a/src/maestro/run.py b/src/maestro/run.py
@@ -231,8 +231,10 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--strategy",
         type=str,
-        choices=[s.value for s in Strategy],
-        help="Run only this strategy (default: all enabled)",
+        help=(
+            "Run only these strategies (default: all enabled). "
+            "Comma-separated for several, e.g. --strategy single_agent,lang_graph"
+        ),
     )
     parser.add_argument(
         "--tier",
@@ -243,12 +245,19 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--model",
         type=str,
-        help="Run only this model (default: all registered)",
+        help=(
+            "Run only these models (default: all registered). "
+            "Comma-separated for several, e.g. "
+            "--model gpt-4o-mini-2024-07-18,deepseek-v4-flash"
+        ),
     )
     parser.add_argument(
         "--example",
         type=str,
-        help="Run only this example_id (default: all registered)",
+        help=(
+            "Run only these example_ids (default: all registered). "
+            "Comma-separated for several, e.g. --example bpmn_1_03,it_1_07"
+        ),
     )
     parser.add_argument(
         "--repeats",
@@ -294,30 +303,67 @@ def parse_args() -> argparse.Namespace:
 # ---------------------------------------------------------------------------
 
 
+def _split_csv(value: str | None) -> list[str] | None:
+    """
+    Split a comma-separated filter value into stripped, non-empty entries, or
+    return None if the flag was absent. Blank entries are dropped so trailing
+    commas and stray spaces don't create phantom values; ``","`` yields ``[]``.
+    """
+    if value is None:
+        return None
+    return [part.strip() for part in value.split(",") if part.strip()]
+
+
 def build_matrix(args: argparse.Namespace) -> list[dict]:
     """
     Build the experiment matrix as a list of dicts, each representing one run.
-    Applies CLI filters to narrow the cross-product.
+    Applies CLI filters to narrow the cross-product. The --strategy, --model
+    and --example flags accept a comma-separated list (membership filter).
     """
+    examples = _split_csv(args.example)
+    model_names = _split_csv(args.model)
+    strategy_names = _split_csv(args.strategy)
+
+    # Validate filter values up front (argparse's ``choices`` check on --strategy
+    # is gone now that flags accept lists). Catches a typo before any matrix work
+    # or API spend — a misspelled value would otherwise silently shrink the matrix.
+    def _reject_unknown(flag: str, given: list[str], valid: set[str]) -> None:
+        unknown = [v for v in given if v not in valid]
+        if unknown:
+            print(
+                f"ERROR: unknown {flag} value(s): {', '.join(unknown)}. "
+                f"Known: {', '.join(sorted(valid))}",
+                file=sys.stderr,
+            )
+            sys.exit(2)
+
+    if strategy_names:
+        _reject_unknown("--strategy", strategy_names, {s.value for s in Strategy})
+    if examples:
+        _reject_unknown("--example", examples, {i.example_id for i in INPUTS})
+    # --model is validated below, after the strategy filter is known: an unknown
+    # model only matters when a real (LLM) strategy is actually selected, so the
+    # control-only no-op (--strategy null_control --model anything) is preserved.
+
     # Filter inputs
     inputs = INPUTS
     if args.tier:
         inputs = [i for i in inputs if i.tier.value == args.tier]
-    if args.example:
-        inputs = [i for i in inputs if i.example_id == args.example]
+    if examples:
+        inputs = [i for i in inputs if i.example_id in examples]
 
     # Filter strategies
     strategies = STRATEGIES
-    if args.strategy:
-        strategies = [s for s in strategies if s.value == args.strategy]
+    if strategy_names:
+        strategies = [s for s in strategies if s.value in strategy_names]
 
     # Filter models — applies only to real (LLM) strategies. Control rows
     # ignore --model because they don't use a model; --model gpt-4o-mini
     # should narrow which LLM rows run but should not silently drop the
     # sanity floor/ceiling rows the experiment needs.
     models = MODELS
-    if args.model:
-        models = [m for m in models if m.model == args.model]
+    if model_names:
+        models = [m for m in models if m.model in model_names]
 
     # Partition by strategy kind. Controls are deterministic in (model,
     # repeat) — collapsing both dimensions to a single row per
@@ -326,22 +372,22 @@ def build_matrix(args: argparse.Namespace) -> list[dict]:
     real_strategies = [s for s in strategies if s not in CONTROL_STRATEGIES]
     control_strategies = [s for s in strategies if s in CONTROL_STRATEGIES]
 
-    # Fail fast on an unknown --model only when it actually matters:
-    # `--strategy null_control --model typo` should be a no-op on --model
-    # (controls don't use any model) rather than aborting. Without this
-    # guard, however, `--model gpt-4o-min` would silently produce just
-    # the 3 control rows when the user wanted a single LLM cell — looks
-    # like a tiny matrix instead of the misuse it is. So: validate the
-    # model flag only when there's at least one real strategy left after
-    # the strategy filter.
-    if args.model and real_strategies and not models:
-        known = ", ".join(m.model for m in MODELS)
-        print(
-            f"ERROR: --model {args.model!r} matches no registered model. "
-            f"Known: {known}",
-            file=sys.stderr,
-        )
-        sys.exit(2)
+    # Fail fast on any unknown --model value, but only when a real (LLM)
+    # strategy is selected. `--strategy null_control --model typo` stays a
+    # no-op on --model (controls don't use any model), so it must not abort.
+    # When a real strategy IS selected, a misspelled model would otherwise
+    # silently shrink the matrix (e.g. `--model gpt-4o-mini-2024-07-18,typo`
+    # would quietly run only the valid one) — so reject the typo loudly.
+    if model_names and real_strategies:
+        registered = {m.model for m in MODELS}
+        unknown = [m for m in model_names if m not in registered]
+        if unknown:
+            print(
+                f"ERROR: unknown --model value(s): {', '.join(unknown)}. "
+                f"Known: {', '.join(sorted(registered))}",
+                file=sys.stderr,
+            )
+            sys.exit(2)
 
     matrix = []