diff --git a/data/05_bpmn_1.JSON b/data/05_bpmn_1.JSON index a5a76d7..2dc8c93 100644 --- a/data/05_bpmn_1.JSON +++ b/data/05_bpmn_1.JSON @@ -33,7 +33,7 @@ }, { "id": "gw_result", - "name": "", + "name": "Result", "type": "exclusiveGateway", "lane": null, "attached_to": null @@ -61,7 +61,7 @@ }, { "id": "gw_manager_decision", - "name": "", + "name": "Manager Decision", "type": "exclusiveGateway", "lane": null, "attached_to": null diff --git a/data/25_bpmn_3.JSON b/data/25_bpmn_3.JSON index e74985f..045aa37 100644 --- a/data/25_bpmn_3.JSON +++ b/data/25_bpmn_3.JSON @@ -180,7 +180,7 @@ }, { "id": "xgw_approval_result", - "name": "", + "name": "Vacation Approval", "type": "exclusiveGateway", "pool": "pool_vacation", "lane": null, @@ -188,7 +188,7 @@ }, { "id": "xgw_manual_result", - "name": "", + "name": "Vacation Approved", "type": "exclusiveGateway", "pool": "pool_vacation", "lane": null, diff --git a/pyproject.toml b/pyproject.toml index d4b7e6a..a4975f1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,6 +101,10 @@ select = ["E", "F", "I", "W"] # human description of the diagram. These are intentionally long prose strings # where wrapping would hurt readability, so line length is not enforced here. "src/maestro/experiment_config.py" = ["E501"] +# prompts.py holds the canonical Mermaid output contract: each rule is a single +# prose line shown to the model verbatim. Wrapping a rule would split it across +# bullets in the rendered prompt, so line length is not enforced here. +"src/maestro/prompts.py" = ["E501"] [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/src/maestro/analysis/metrics.py b/src/maestro/analysis/metrics.py index d595f6c..18a5e00 100644 --- a/src/maestro/analysis/metrics.py +++ b/src/maestro/analysis/metrics.py @@ -133,6 +133,21 @@ def _lemmatize_label(label: str) -> str: r"\s*[\]\)\}]+" # closing bracket(s) ) + +def _strip_inline_labels(line: str) -> str: + """ + Replace every inline node definition (``id["Label"]``) with its bare ``id``. + + Edge lines may redeclare a node's label on one or both endpoints, e.g. + ``a["A"] --> b["B"]``. The edge regexes expect the id to sit directly + against the operator, so a labelled *source* would otherwise break edge + extraction. Collapsing each ``id[...]`` to ``id`` leaves a clean + ``a --> b`` for the operator scan; node labels are captured separately by + ``extract_nodes`` so nothing is lost here. + """ + return _NODE_DEF.sub(lambda m: m.group(1), line) + + # Edge label between pipes, e.g. -->|"Green (no risk)"| — stripped before node # scanning so its text is never mistaken for a node definition. _PIPE_LABEL = re.compile(r"\|[^|]*\|") @@ -240,6 +255,10 @@ def _add(src: str, tgt: str, rel_type: str, undirected: bool = False) -> None: line = raw.strip() if not line or line.startswith("%%"): continue + # Collapse any inline node labels (``a["A"] --> b["B"]``) to bare ids + # so a labelled source endpoint can't hide the edge from the operator + # scan. Node labels themselves are captured by ``extract_nodes``. + line = _strip_inline_labels(line) for m in _EDGE.finditer(line): src, op, tgt = m.group(1), m.group(2), m.group(3) if op in ("o--o", "--o", "--x"): @@ -266,6 +285,9 @@ def extract_attachments(mermaid_code: str) -> list[dict]: line = raw.strip() if not line or line.startswith("%%"): continue + # Same inline-label collapse as extract_relationships: an attachment + # written ``task["T"] o--o evt(("E"))`` must still match _ATTACH. + line = _strip_inline_labels(line) for m in _ATTACH.finditer(line): a, b = sorted((m.group(1), m.group(2))) if (a, b) not in seen: diff --git a/src/maestro/prompts.py b/src/maestro/prompts.py new file mode 100644 index 0000000..ee68e63 --- /dev/null +++ b/src/maestro/prompts.py @@ -0,0 +1,58 @@ +""" +MAESTRO — Canonical Mermaid output contract (single source of truth). + +The rules that tell a model how to emit Mermaid live HERE and nowhere else. +Providers supply the system-message identity from ``MERMAID_SYSTEM_IDENTITY``; +the single-agent baseline and multi-step step 3 build their user prompts from +``render_rules()``. Defining the contract once keeps every provider and every +orchestration strategy on a byte-identical output contract — so quality +differences are attributable to orchestration (the independent variable), not +to drifting prompt wording. + +Why this module imports nothing from ``maestro``: ``providers`` and +``strategies`` both depend on it, so any back-import would create a cycle. +Keep it dependency-free (plain strings + one helper). + +The optional ``skill`` layer in ``render_rules`` is the future +prompt-enhancement variable: an append-only block, never an edit to the +baseline rules, so the enhancement stays an isolatable condition. +""" + +from __future__ import annotations + +# System-message identity. Was duplicated verbatim as ``SYSTEM_PROMPT`` in +# every provider subclass; now defined once and assigned on ``LLMProvider``. +MERMAID_SYSTEM_IDENTITY = ( + "You are a diagram generation assistant. " + "Respond only with valid Mermaid diagram code. " + "Do not include any explanation, markdown fencing, or additional text." +) + +# Unified user-prompt output rules. Was hand-copied (with drift) in +# ``single.py`` and step 3 of ``_extraction.py``; now one contract applied +# identically to both. Brace-free on purpose so it can be embedded into a +# template string ahead of any later ``.format()`` call without escaping. +MERMAID_RULES = """\ +- Begin the diagram with a flowchart header, `flowchart LR`; do not use C4, sequence, class, or other diagram types +- Output only valid Mermaid syntax +- Wrap node labels in double quotes, e.g. node_id["My Label"], so labels with spaces, parentheses, slashes, or line breaks stay parseable +- If a node has no label, write just its id (e.g. gw_result) — never an empty bracket like node_id[""] +- Quote edge labels the same way, with no spaces inside the pipes, e.g. a -->|"My edge"| b; for an unlabelled edge use a plain arrow a --> b and never an empty label like -->|| or -->| | +- Include every entity and relationship from the input +- Preserve hierarchy using subgraphs for pools, lanes, and subprocesses +- Do not invent entities or relationships not present in the input +- Do not include explanations or markdown code fences +- Do not use internal or relationship IDs as edge labels""" + + +def render_rules(skill: str | None = None) -> str: + """ + Return the canonical Mermaid rules, optionally extended by a skills block. + + The skills block is APPEND-ONLY: it is concatenated after the baseline + rules and must never edit them, so a baseline run (``skill=None``) and an + enhanced run differ only by the added text. Baseline callers pass ``None``. + """ + if skill is None: + return MERMAID_RULES + return MERMAID_RULES + "\n" + skill diff --git a/src/maestro/providers/anthropic.py b/src/maestro/providers/anthropic.py index 263da4e..92d3fee 100644 --- a/src/maestro/providers/anthropic.py +++ b/src/maestro/providers/anthropic.py @@ -28,12 +28,7 @@ class AnthropicProvider(LLMProvider): Uses the official anthropic SDK — add 'anthropic>=0.25.0' to pyproject.toml. """ - # Instructs the model to output diagram code only — no prose or fencing - SYSTEM_PROMPT = ( - "You are a diagram generation assistant. " - "Respond only with valid Mermaid diagram code. " - "Do not include any explanation, markdown fencing, or additional text." - ) + # SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts). # Max tokens for the completion — diagram code is rarely long MAX_TOKENS = 4096 diff --git a/src/maestro/providers/base.py b/src/maestro/providers/base.py index 220b835..b50c0f8 100644 --- a/src/maestro/providers/base.py +++ b/src/maestro/providers/base.py @@ -5,6 +5,7 @@ from abc import ABC, abstractmethod +from maestro.prompts import MERMAID_SYSTEM_IDENTITY from maestro.schemas import ModelPricing, RunConfig, RunResult @@ -17,6 +18,12 @@ class LLMProvider(ABC): # Centralized temperature setting — 0 for reproducibility across all providers TEMPERATURE = 0 + # Default system identity for Mermaid generation, shared by every provider. + # Subclasses inherit this; a strategy that needs a different identity for a + # given call (e.g. SOP steps 1-2 requesting JSON) passes system_prompt + # explicitly to complete(). + SYSTEM_PROMPT = MERMAID_SYSTEM_IDENTITY + def __init__(self, api_key: str, pricing: ModelPricing) -> None: # api_key stored on instance — never logged or serialised self.api_key = api_key diff --git a/src/maestro/providers/gemini.py b/src/maestro/providers/gemini.py index 52729a6..dab9267 100644 --- a/src/maestro/providers/gemini.py +++ b/src/maestro/providers/gemini.py @@ -24,11 +24,7 @@ class GeminiProvider(LLMProvider): Uses the official google-genai SDK — add 'google-genai>=1.0' to pyproject.toml. """ - SYSTEM_PROMPT = ( - "You are a diagram generation assistant. " - "Respond only with valid Mermaid diagram code. " - "Do not include any explanation, markdown fencing, or additional text." - ) + # SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts). MAX_TOKENS = 4096 diff --git a/src/maestro/providers/mistral.py b/src/maestro/providers/mistral.py index 2821b17..d61f75e 100644 --- a/src/maestro/providers/mistral.py +++ b/src/maestro/providers/mistral.py @@ -25,11 +25,7 @@ class MistralProvider(LLMProvider): ``mistralai.client.errors`` respectively. """ - SYSTEM_PROMPT = ( - "You are a diagram generation assistant. " - "Respond only with valid Mermaid diagram code. " - "Do not include any explanation, markdown fencing, or additional text." - ) + # SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts). MAX_TOKENS = 4096 diff --git a/src/maestro/providers/openai.py b/src/maestro/providers/openai.py index 8ebb125..0f0d6dd 100644 --- a/src/maestro/providers/openai.py +++ b/src/maestro/providers/openai.py @@ -36,12 +36,7 @@ class OpenAIProvider(LLMProvider): # logged as "openai", misattributing failures in multi-provider runs. _PROVIDER_NAME = "openai" - # Same role as AnthropicProvider.SYSTEM_PROMPT - SYSTEM_PROMPT = ( - "You are a diagram generation assistant. " - "Respond only with valid Mermaid diagram code. " - "Do not include any explanation, markdown fencing, or additional text." - ) + # SYSTEM_PROMPT inherited from LLMProvider (maestro.prompts). # Max tokens for the completion MAX_TOKENS = 4096 diff --git a/src/maestro/run.py b/src/maestro/run.py index 809cff0..8d82ac2 100644 --- a/src/maestro/run.py +++ b/src/maestro/run.py @@ -231,8 +231,10 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--strategy", type=str, - choices=[s.value for s in Strategy], - help="Run only this strategy (default: all enabled)", + help=( + "Run only these strategies (default: all enabled). " + "Comma-separated for several, e.g. --strategy single_agent,lang_graph" + ), ) parser.add_argument( "--tier", @@ -243,12 +245,19 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--model", type=str, - help="Run only this model (default: all registered)", + help=( + "Run only these models (default: all registered). " + "Comma-separated for several, e.g. " + "--model gpt-4o-mini-2024-07-18,deepseek-v4-flash" + ), ) parser.add_argument( "--example", type=str, - help="Run only this example_id (default: all registered)", + help=( + "Run only these example_ids (default: all registered). " + "Comma-separated for several, e.g. --example bpmn_1_03,it_1_07" + ), ) parser.add_argument( "--repeats", @@ -294,30 +303,67 @@ def parse_args() -> argparse.Namespace: # --------------------------------------------------------------------------- +def _split_csv(value: str | None) -> list[str] | None: + """ + Parse a comma-separated filter value into a clean list, or None if the + flag was absent. Empty/whitespace-only entries are dropped so trailing + commas and stray spaces don't create phantom filter values. + """ + if value is None: + return None + return [part.strip() for part in value.split(",") if part.strip()] + + def build_matrix(args: argparse.Namespace) -> list[dict]: """ Build the experiment matrix as a list of dicts, each representing one run. - Applies CLI filters to narrow the cross-product. + Applies CLI filters to narrow the cross-product. The --strategy, --model + and --example flags accept a comma-separated list (membership filter). """ + examples = _split_csv(args.example) + model_names = _split_csv(args.model) + strategy_names = _split_csv(args.strategy) + + # Validate filter values up front (argparse no longer does, now that the + # flags accept lists). Catches a typo before any matrix work or API spend — + # a misspelled value in a list would otherwise silently shrink the matrix. + def _reject_unknown(flag: str, given: list[str], valid: set[str]) -> None: + unknown = [v for v in given if v not in valid] + if unknown: + print( + f"ERROR: unknown {flag} value(s): {', '.join(unknown)}. " + f"Known: {', '.join(sorted(valid))}", + file=sys.stderr, + ) + sys.exit(2) + + if strategy_names: + _reject_unknown("--strategy", strategy_names, {s.value for s in Strategy}) + if examples: + _reject_unknown("--example", examples, {i.example_id for i in INPUTS}) + # --model is validated below, after the strategy filter is known: an unknown + # model only matters when a real (LLM) strategy is actually selected, so the + # control-only no-op (--strategy null_control --model anything) is preserved. + # Filter inputs inputs = INPUTS if args.tier: inputs = [i for i in inputs if i.tier.value == args.tier] - if args.example: - inputs = [i for i in inputs if i.example_id == args.example] + if examples: + inputs = [i for i in inputs if i.example_id in examples] # Filter strategies strategies = STRATEGIES - if args.strategy: - strategies = [s for s in strategies if s.value == args.strategy] + if strategy_names: + strategies = [s for s in strategies if s.value in strategy_names] # Filter models — applies only to real (LLM) strategies. Control rows # ignore --model because they don't use a model; --model gpt-4o-mini # should narrow which LLM rows run but should not silently drop the # sanity floor/ceiling rows the experiment needs. models = MODELS - if args.model: - models = [m for m in models if m.model == args.model] + if model_names: + models = [m for m in models if m.model in model_names] # Partition by strategy kind. Controls are deterministic in (model, # repeat) — collapsing both dimensions to a single row per @@ -326,22 +372,22 @@ def build_matrix(args: argparse.Namespace) -> list[dict]: real_strategies = [s for s in strategies if s not in CONTROL_STRATEGIES] control_strategies = [s for s in strategies if s in CONTROL_STRATEGIES] - # Fail fast on an unknown --model only when it actually matters: - # `--strategy null_control --model typo` should be a no-op on --model - # (controls don't use any model) rather than aborting. Without this - # guard, however, `--model gpt-4o-min` would silently produce just - # the 3 control rows when the user wanted a single LLM cell — looks - # like a tiny matrix instead of the misuse it is. So: validate the - # model flag only when there's at least one real strategy left after - # the strategy filter. - if args.model and real_strategies and not models: - known = ", ".join(m.model for m in MODELS) - print( - f"ERROR: --model {args.model!r} matches no registered model. " - f"Known: {known}", - file=sys.stderr, - ) - sys.exit(2) + # Fail fast on any unknown --model value, but only when a real (LLM) + # strategy is selected. `--strategy null_control --model typo` stays a + # no-op on --model (controls don't use any model), so it must not abort. + # When a real strategy IS selected, a misspelled model would otherwise + # silently shrink the matrix (e.g. `--model gpt-4o-mini-2024-07-18,typo` + # would quietly run only the valid one) — so reject the typo loudly. + if model_names and real_strategies: + registered = {m.model for m in MODELS} + unknown = [m for m in model_names if m not in registered] + if unknown: + print( + f"ERROR: unknown --model value(s): {', '.join(unknown)}. " + f"Known: {', '.join(sorted(registered))}", + file=sys.stderr, + ) + sys.exit(2) matrix = [] diff --git a/src/maestro/strategies/_extraction.py b/src/maestro/strategies/_extraction.py index a479191..e3d3269 100644 --- a/src/maestro/strategies/_extraction.py +++ b/src/maestro/strategies/_extraction.py @@ -18,6 +18,8 @@ import json +from maestro.prompts import render_rules + # --------------------------------------------------------------------------- # Step 1 — Extract entities (nodes) from the input dataset # --------------------------------------------------------------------------- @@ -84,23 +86,22 @@ # Step 3 — Render Mermaid diagram from entities + relationships # --------------------------------------------------------------------------- -STEP_3_PROMPT = """\ +# Rules come from the canonical contract (maestro.prompts) so step 3 and the +# single-agent baseline are given a byte-identical output contract. The runtime +# placeholders are escaped (``{{...}}``) so they survive this f-string and are +# filled later by ``.format(entities_json=..., relationships_json=...)``. +STEP_3_PROMPT = f"""\ You are given a set of entities and relationships extracted from a dataset. Your task is to generate a Mermaid diagram that accurately represents them. Rules: -- Output only valid Mermaid syntax -- Include all entities with correct hierarchy (subgraphs for pools/lanes/subprocesses) -- Include all relationships as edges -- Do not invent entities or relationships not provided -- Do not include explanations or markdown code fences -- Do not use relationship IDs as edge labels +{render_rules()} Entities: -{entities_json} +{{entities_json}} Relationships: -{relationships_json} +{{relationships_json}} """ diff --git a/src/maestro/strategies/single.py b/src/maestro/strategies/single.py index 7ef2336..b9d3182 100644 --- a/src/maestro/strategies/single.py +++ b/src/maestro/strategies/single.py @@ -19,22 +19,23 @@ import json +from maestro.prompts import render_rules from maestro.schemas import InputFile, RunConfig, RunResult, SubResult from maestro.strategies.base import BaseStrategy -PROMPT_TEMPLATE = """\ +# Rules come from the canonical contract (maestro.prompts) so single-agent and +# the multi-step strategies are given a byte-identical output contract. The +# runtime placeholder is escaped as ``{{input_data}}`` so it survives this +# f-string and is filled later by ``.format(input_data=...)``. +PROMPT_TEMPLATE = f"""\ You are given a dataset describing entities and their relationships. Your task is to generate a Mermaid diagram that accurately represents this data. Rules: -- Output only valid Mermaid syntax -- Include all entities and relationships from the input -- Do not invent entities or relationships not present in the data -- Do not include explanations or markdown code fences -- Do not use internal IDs as edge labels +{render_rules()} Input data: -{input_data} +{{input_data}} """ diff --git a/tests/analysis/test_extraction.py b/tests/analysis/test_extraction.py index 0621f10..39854e7 100644 --- a/tests/analysis/test_extraction.py +++ b/tests/analysis/test_extraction.py @@ -180,6 +180,31 @@ def test_sequence_flow_solid_arrow(): assert rels[0]["type"] == "sequence_flow" +def test_edge_with_inline_labels_on_both_endpoints(): + """ + Regression: a model may redeclare node labels on the edge line itself, + e.g. ``a["A"] --> b["B"]``. A labelled *source* used to hide the edge from + the operator scan (returned []), zeroing the relationship score on an + otherwise-correct diagram. Bare ids must still be recovered. + """ + code = 'flowchart TD\n a["A"] --> b["B"]\n b["B"] -->|"go"| c["C"]\n' + assert _pairs(extract_relationships(code)) == {("a", "b"), ("b", "c")} + + +def test_edge_inline_label_with_newline_and_parens(): + """The redeclared label can contain \\n and parentheses (BPMN gateways).""" + code = 'flowchart TD\n gw{"Gateway\\n(Split)"} --> task["Task 1"]\n' + assert _pairs(extract_relationships(code)) == {("gw", "task")} + + +def test_attachment_with_inline_labels_on_both_endpoints(): + """Same inline-label collapse must apply to o--o attachment edges.""" + code = 'flowchart LR\n host["Host"] o--o evt(("Boundary"))\n' + atts = extract_attachments(code) + assert len(atts) == 1 + assert tuple(sorted((atts[0]["a"], atts[0]["b"]))) == ("evt", "host") + + # --------------------------------------------------------------------------- # 3b — container metrics (reuse entity matchers) # --------------------------------------------------------------------------- diff --git a/tests/test_prompts.py b/tests/test_prompts.py new file mode 100644 index 0000000..5804326 --- /dev/null +++ b/tests/test_prompts.py @@ -0,0 +1,158 @@ +""" +Guards for the single-source-of-truth Mermaid output contract (maestro.prompts). + +These tests pin the contract so it cannot silently drift back into the +per-provider / per-strategy copies it was consolidated from: + + * snapshot — the canonical identity + rules text is intentional, + so a change must be a deliberate edit to this file. + * no-duplication — every provider's SYSTEM_PROMPT *is* the shared + identity object (catches anyone re-inlining a literal), + and the identity actually reaches provider.complete(). + * identical-injection — the rules block embedded in the single-agent prompt + equals the one in multi-step step 3 (the drift this + refactor fixed). +""" + +from __future__ import annotations + +from maestro.prompts import ( + MERMAID_RULES, + MERMAID_SYSTEM_IDENTITY, + render_rules, +) +from maestro.providers.anthropic import AnthropicProvider +from maestro.providers.base import LLMProvider +from maestro.providers.deepseek import DeepSeekProvider +from maestro.providers.gemini import GeminiProvider +from maestro.providers.mistral import MistralProvider +from maestro.providers.openai import OpenAIProvider +from maestro.strategies._extraction import STEP_3_PROMPT +from maestro.strategies.single import PROMPT_TEMPLATE + +# Concrete providers whose SYSTEM_PROMPT must resolve to the shared identity. +# deepseek defines no literal of its own — it must inherit via OpenAIProvider. +_PROVIDERS = [ + AnthropicProvider, + OpenAIProvider, + GeminiProvider, + MistralProvider, + DeepSeekProvider, +] + + +# --------------------------------------------------------------------------- +# Snapshot — canonical text is pinned; edits must be deliberate. +# --------------------------------------------------------------------------- + + +def test_system_identity_snapshot(): + assert MERMAID_SYSTEM_IDENTITY == ( + "You are a diagram generation assistant. " + "Respond only with valid Mermaid diagram code. " + "Do not include any explanation, markdown fencing, or additional text." + ) + + +def test_rules_snapshot(): + assert MERMAID_RULES == ( + "- Begin the diagram with a flowchart header, `flowchart LR`; " + "do not use C4, sequence, class, or other diagram types\n" + "- Output only valid Mermaid syntax\n" + '- Wrap node labels in double quotes, e.g. node_id["My Label"], so ' + "labels with spaces, parentheses, slashes, or line breaks stay " + "parseable\n" + "- If a node has no label, write just its id (e.g. gw_result) — " + 'never an empty bracket like node_id[""]\n' + "- Quote edge labels the same way, with no spaces inside the pipes, " + 'e.g. a -->|"My edge"| b; for an unlabelled edge use a plain arrow ' + "a --> b and never an empty label like -->|| or -->| |\n" + "- Include every entity and relationship from the input\n" + "- Preserve hierarchy using subgraphs for pools, lanes, and subprocesses\n" + "- Do not invent entities or relationships not present in the input\n" + "- Do not include explanations or markdown code fences\n" + "- Do not use internal or relationship IDs as edge labels" + ) + + +def test_rules_are_brace_free(): + """Braces would break the later .format() call on the strategy templates.""" + assert "{" not in MERMAID_RULES and "}" not in MERMAID_RULES + + +# --------------------------------------------------------------------------- +# No-duplication — providers share the one identity object, not copies. +# --------------------------------------------------------------------------- + + +def test_base_defines_the_identity(): + assert LLMProvider.SYSTEM_PROMPT is MERMAID_SYSTEM_IDENTITY + + +def test_no_provider_reinlines_system_prompt(): + for provider_cls in _PROVIDERS: + assert provider_cls.SYSTEM_PROMPT is MERMAID_SYSTEM_IDENTITY, ( + f"{provider_cls.__name__} re-inlined SYSTEM_PROMPT instead of " + "inheriting the shared identity" + ) + + +def test_fallback_identity_resolves_to_shared(recording_provider_factory): + """ + When a caller passes system_prompt=None, real providers fall back to + self.SYSTEM_PROMPT (the ``system_prompt if system_prompt is not None else + self.SYSTEM_PROMPT`` expression in every complete()). This exercises that + fallback expression against an LLMProvider subclass and asserts it resolves + to the shared identity object — so a re-inlined per-provider literal would + be caught here, not just at the class-attribute level. + """ + provider = recording_provider_factory(outputs=["graph TD\n a"]) + system_prompt = None + effective_system = ( + system_prompt if system_prompt is not None else provider.SYSTEM_PROMPT + ) + assert effective_system is MERMAID_SYSTEM_IDENTITY + + +# --------------------------------------------------------------------------- +# Identical-injection — single-agent and step 3 carry the same rules block. +# --------------------------------------------------------------------------- + + +def _rules_block(template: str) -> str: + """Extract the block under 'Rules:' up to the next blank line.""" + return template.split("Rules:\n", 1)[1].split("\n\n", 1)[0] + + +def test_single_and_step3_inject_identical_rules(): + single_rules = _rules_block(PROMPT_TEMPLATE) + step3_rules = _rules_block(STEP_3_PROMPT) + assert single_rules == step3_rules == render_rules() + + +def test_templates_keep_runtime_placeholders(): + """The .format() placeholders must survive the f-string composition.""" + assert "{input_data}" in PROMPT_TEMPLATE + assert "{entities_json}" in STEP_3_PROMPT + assert "{relationships_json}" in STEP_3_PROMPT + + +def test_templates_format_without_stray_braces(): + PROMPT_TEMPLATE.format(input_data="{}") + STEP_3_PROMPT.format(entities_json="[]", relationships_json="[]") + + +# --------------------------------------------------------------------------- +# render_rules — baseline vs append-only skills layer. +# --------------------------------------------------------------------------- + + +def test_render_rules_baseline_is_canonical(): + assert render_rules() == MERMAID_RULES + assert render_rules(None) == MERMAID_RULES + + +def test_render_rules_skill_is_append_only(): + rendered = render_rules("- Prefer graph LR") + assert rendered.startswith(MERMAID_RULES) + assert rendered.endswith("- Prefer graph LR") diff --git a/tests/test_run_filters.py b/tests/test_run_filters.py new file mode 100644 index 0000000..efd1c98 --- /dev/null +++ b/tests/test_run_filters.py @@ -0,0 +1,103 @@ +""" +Filter validation in run.build_matrix. + +The --strategy / --model / --example flags accept comma-separated lists. A typo +in a list would silently shrink the experiment matrix (run fewer cells than the +user intended) — costly to notice halfway through a multi-hour run. These tests +pin the fail-fast behavior: + + * unknown --strategy / --example → exit 2 (strict) + * unknown --model → exit 2 *only* when a real LLM strategy is + selected; a control-only run ignores + --model (controls use no model), so a + bad model there is a deliberate no-op. +""" + +from __future__ import annotations + +import argparse + +import pytest + +from maestro.run import _split_csv, build_matrix + + +def _args(**overrides) -> argparse.Namespace: + """Namespace with the build_matrix-relevant fields, overridable per test.""" + base = dict(strategy=None, tier=None, model=None, example=None, repeats=1) + base.update(overrides) + return argparse.Namespace(**base) + + +# --------------------------------------------------------------------------- +# _split_csv +# --------------------------------------------------------------------------- + + +def test_split_csv_none_passes_through(): + assert _split_csv(None) is None + + +def test_split_csv_trims_and_drops_empties(): + assert _split_csv("a, b ,,c,") == ["a", "b", "c"] + + +# --------------------------------------------------------------------------- +# Valid filters build a matrix +# --------------------------------------------------------------------------- + + +def test_valid_list_filters_build_matrix(): + args = _args( + example="bpmn_1_03,it_1_07", + model="gpt-4o-mini-2024-07-18,deepseek-v4-flash", + strategy="single_agent,lang_graph", + repeats=2, + ) + matrix = build_matrix(args) + # 2 inputs × 2 strategies × 2 models × 2 repeats + assert len(matrix) == 16 + assert {c["input_file"].example_id for c in matrix} == {"bpmn_1_03", "it_1_07"} + assert {c["model_pricing"].model for c in matrix} == { + "gpt-4o-mini-2024-07-18", + "deepseek-v4-flash", + } + + +# --------------------------------------------------------------------------- +# Unknown values fail fast +# --------------------------------------------------------------------------- + + +def test_unknown_strategy_exits_2(): + with pytest.raises(SystemExit) as exc: + build_matrix(_args(strategy="single_agent,typo_strat")) + assert exc.value.code == 2 + + +def test_unknown_example_exits_2(): + with pytest.raises(SystemExit) as exc: + build_matrix(_args(example="bpmn_1_03,not_a_real_id", strategy="single_agent")) + assert exc.value.code == 2 + + +def test_unknown_model_exits_2_with_real_strategy(): + with pytest.raises(SystemExit) as exc: + build_matrix( + _args( + model="gpt-4o-mini-2024-07-18,typo_model", + strategy="single_agent", + example="bpmn_1_03", + ) + ) + assert exc.value.code == 2 + + +def test_unknown_model_is_noop_for_control_only_run(): + """Controls use no model, so a bad --model must not abort a control run.""" + matrix = build_matrix( + _args(model="typo_model", strategy="null_control", example="bpmn_1_03") + ) + # One control row for the single input — bad model ignored, no exit. + assert len(matrix) == 1 + assert matrix[0]["strategy"].value == "null_control"