diff --git a/README.md b/README.md index 817acc2..c3bec01 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,9 @@ ![ModelPort social preview](assets/social-preview.svg) -**Ship model upgrades without breaking prod.** +**Ship model upgrades without breaking prod — then prove it.** -Agent-native, drop-in, behavior-preserving LLM migrations — across prompts, agents, tools, API callers, and tests. One skill, works in Claude Code, Codex, and Cursor. +Two things in one agent skill: **behavior-preserving migrations** across prompts, agents, tools, API callers, and tests — and **built-in evals** that score your old vs. new system so every upgrade is backed by numbers, not vibes. Works in Claude Code, Codex, and Cursor. > **Install in one line:** `npx skills add forkadarshp/MPort` — then tell your agent to migrate. See [Quickstart](#quickstart). @@ -106,11 +106,12 @@ contracts, no proof, and no way back. ╰───────────────────────────────────┴───────────────────────────────────╯ ``` -## Benchmark your migration (optional) +## Benchmark the upgrade — built-in evals -Opt in at the start and ModelPort ends with **measured evidence, not vibes**. It -runs the same eval set against three configurations so the raw model delta and -the skill's added value are attributed separately: +Migrations shouldn't be a leap of faith. Opt in and ModelPort ends with +**measured evidence, not vibes** — it runs the same eval set against three +configurations so the raw model delta and the skill's added value are attributed +separately: - **Baseline** — old model + old prompts (where you started) - **Naive swap** — new model + old prompts (what a find/replace would get you) @@ -143,6 +144,13 @@ particular move with the target model and are always reported as measured, never assumed. Methodology, metric definitions, and composite scoring live in [references/benchmarking.md](references/benchmarking.md). 
+**Run it yourself.** A bundled harness turns this into a closed loop: +`python3 harness/run.py` scores the three arms on an eval set, and +`harness/iterate.py` sweeps prompt revisions so you watch the score climb until +it plateaus. Two scenarios ship (support triage + multi-tool routing); it runs +offline by default; pass `--provider anthropic` for measured numbers. See +[harness/README.md](harness/README.md). + ## Why ModelPort - Replace deprecated model IDs without breaking runtime calls. diff --git a/harness/README.md b/harness/README.md index f0d35ed..2c841a8 100644 --- a/harness/README.md +++ b/harness/README.md @@ -22,6 +22,9 @@ python3 run.py --provider sim # real measured numbers — needs ANTHROPIC_API_KEY and `pip install anthropic` python3 run.py --provider anthropic + +# sweep a sequence of prompt revisions and watch the score trajectory +python3 iterate.py ``` Output: a leaderboard (task success, output-contract conformance, tool-call @@ -41,6 +44,25 @@ scored a **negative** skill delta (it cost more without lifting quality); making the contract explicit (JSON-only, enumerated schema, no prose) moved ModelPort from last place to a clear win. +## Automated sweep (`iterate.py`) + +`iterate.py` runs that loop for you — a sequence of cumulative prompt revisions, +one technique each, with the score trajectory: + +```bash +python3 iterate.py +``` + +On the default scenario (`support_triage`) the curve climbs from composite 0.64 to 0.75 over the +first eight steps (task/contract/tool 64% → 93%, failing cases 6 → 1), then +**plateaus**: the last two steps (lowercase rule, few-shot example) add prompt +length and cost without lifting the capped scores, so the composite dips +slightly. Two real takeaways the harness surfaces: + +- the pre-plateau step is the optimum — more prompt is not better; +- the final failing case isn't solvable by prompting alone (it needs a fallback + or output validator), which is a design signal, not a prompt bug. 
+ ## Providers - **`sim`** — offline and deterministic. Outputs are a *simulation* driven only @@ -63,7 +85,10 @@ Copy `scenarios/support_triage.json` and edit: ## Files - `run.py` — orchestrates the three arms, grades, prints the leaderboard +- `iterate.py` — prompt-optimization sweep (score trajectory across revisions) - `graders.py` — provider-agnostic scoring (contract, tool, task) - `providers.py` — `SimProvider` (offline) + `AnthropicProvider` (real) -- `scenarios/` — scenario fixtures +- `scenarios/` — fixtures: `support_triage` (contract-focused, plateaus ~93%) + and `ops_routing` (harder multi-tool routing, plateaus ~79% — several + compound/ambiguous cases prompting can't fix). Pass `--scenario <path>` to target one. - `tests/` — `python3 -m unittest discover -s tests` diff --git a/harness/iterate.py b/harness/iterate.py new file mode 100644 index 0000000..4891d68 --- /dev/null +++ b/harness/iterate.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 +"""Prompt-optimization sweep. + +Runs a sequence of enhanced-prompt revisions through the benchmark and prints +the score trajectory — the iterate-on-failures loop the harness is built for. +Each revision adds one prompt-engineering technique aimed at the dimension the +previous step was still failing, until the prompt levers run out and the score +plateaus (the last failures need more than prompting). + +The revision clauses are derived from the scenario, so the sweep works on any +scenario, not just the bundled one. 
+ +Usage: + python3 iterate.py # offline simulator + python3 iterate.py --scenario scenarios/ops_routing.json + python3 iterate.py --provider anthropic # real numbers; needs key +""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path + +import run +from providers import get_provider + + +def build_steps(sc: dict) -> list[tuple[str, str]]: + """Cumulative prompt-engineering steps, parameterized by the scenario.""" + tools = ", ".join(t["name"] for t in sc["tools"]) + ex = sc["eval_cases"][0]["expected"] + example = json.dumps({"category": ex["category"], "tool": ex["tool"], "args": ex.get("args", {})}) + return [ + ("ask for JSON", "Return a JSON object with the category and the tool."), + ("return ONLY JSON", "Return ONLY the JSON object."), + ("state schema + args", "Follow this schema: {category, tool, args}."), + ("enumerate tools", f"Tools: {tools}."), + ("forbid prose", "No prose."), + ("forbid code fences", "Do not wrap it in code fences."), + ("extract args", "Extract any IDs, names, versions, or emails into args."), + ("exactly one each", "Choose exactly one category and exactly one tool."), + ("lowercase values", "Use lowercase values exactly as listed."), + ("few-shot example", f"Example: {example}"), + ] + + +def prompt_at(role: str, steps: list[tuple[str, str]], i: int) -> str: + return role + " " + " ".join(clause for _, clause in steps[: i + 1]) + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--scenario", default=str(Path(__file__).parent / "scenarios/support_triage.json")) + ap.add_argument("--provider", default="sim", choices=["sim", "anthropic"]) + args = ap.parse_args() + + sc = json.loads(Path(args.scenario).read_text()) + prov = get_provider(args.provider, sc) + old, new = sc["models"]["old"], sc["models"]["new"] + role = sc.get("role", "You are an assistant.") + steps = build_steps(sc) + base_prompt = sc["prompts"]["baseline"] + + A = run.run_arm(prov, old, base_prompt, sc, 
sc["sim"][old]) + B = run.run_arm(prov, new, base_prompt, sc, sc["sim"][new]) + + print(f"\nPrompt-optimization sweep — scenario: {sc['name']} | provider: {args.provider}") + if args.provider == "sim": + print("(simulated — illustrative trajectory; run --provider anthropic for measured numbers)") + print(f"baseline composite {run.composite([A, B, A])[0]:.2f} " + f"naive-swap composite {run.composite([A, B, B])[1]:.2f}\n") + print(f"{'it':>2} {'technique added':<20} {'task':>5} {'cont':>5} {'tool':>5} " + f"{'comp':>5} {'skillΔ':>7} fails") + print("-" * 64) + + best = (-1.0, 0) + for i, (label, _) in enumerate(steps): + C = run.run_arm(prov, new, prompt_at(role, steps, i), sc, sc["sim"][new]) + comp = run.composite([A, B, C]) + print(f"{i + 1:>2} +{label:<19} {C['task'] * 100:>4.0f}% {C['contract'] * 100:>4.0f}% " + f"{C['tool'] * 100:>4.0f}% {comp[2]:>5.2f} {comp[2] - comp[1]:>+7.2f} {len(C['fails'])}") + if comp[2] > best[0]: + best = (comp[2], i) + + print(f"\nbest: iteration {best[1] + 1} (composite {best[0]:.2f}).") + print("tuned enhanced prompt:") + print(f" {prompt_at(role, steps, best[1])}") + + +if __name__ == "__main__": + main() diff --git a/harness/providers.py b/harness/providers.py index 4b43401..dc51870 100644 --- a/harness/providers.py +++ b/harness/providers.py @@ -31,15 +31,25 @@ def _clamp(x: float, lo: float = 0.02, hi: float = 0.95) -> float: def explicitness(prompt: str, scenario: dict) -> dict: - """Score how precisely a prompt specifies the output contract and tools.""" + """Score how precisely a prompt specifies the output contract and tools. + + The marker sets are intentionally granular so there is a long, realistic + prompt-optimization path (see iterate.py): each technique a revision adds + raises fidelity a little, until the prompt levers are exhausted and the + score plateaus (the remaining failures need more than prompting). 
+ """ p = prompt.lower() - contract_markers = ["only", "json", "schema", "no prose", "do not wrap"] + contract_markers = [ + "json", "only", "schema", "no prose", "do not wrap", + "exactly one", "lowercase", "example", + ] c = sum(m in p for m in contract_markers) / len(contract_markers) tool_names = [t["name"] for t in scenario["tools"] if t["name"] != "none"] enum = sum(name in p for name in tool_names) / max(1, len(tool_names)) args_marker = 1.0 if "args" in p else 0.0 - tool_e = 0.6 * enum + 0.4 * args_marker + extract_marker = 1.0 if "extract" in p else 0.0 + tool_e = 0.6 * enum + 0.2 * args_marker + 0.2 * extract_marker return {"contract": c, "tool": tool_e, "task": (c + tool_e) / 2} diff --git a/harness/scenarios/ops_routing.json b/harness/scenarios/ops_routing.json new file mode 100644 index 0000000..2c4403e --- /dev/null +++ b/harness/scenarios/ops_routing.json @@ -0,0 +1,39 @@ +{ + "name": "ops-tool-routing", + "role": "You are an ops-assistant that routes a request to exactly one tool.", + "description": "Route a natural-language ops request to one of six tools with the right multi-field args, and return strict JSON. Harder than triage: more tools (more ways to mis-route), multi-arg calls, and several compound/ambiguous cases that prompting alone can't fully resolve.", + "models": { "old": "claude-opus-4-6", "new": "claude-opus-4-8" }, + "categories": ["deploy", "reliability", "observability", "incident", "other"], + "tools": [ + { "name": "restart_service", "args": ["service"] }, + { "name": "scale_service", "args": ["service", "replicas"] }, + { "name": "rollback_deploy", "args": ["service", "version"] }, + { "name": "fetch_logs", "args": ["service"] }, + { "name": "create_incident", "args": ["service", "severity"] }, + { "name": "none", "args": [] } + ], + "prompts": { + "baseline": "You are an ops assistant. Read the request and decide how to handle it. 
Tell me the intent and which tool you'd use.", + "enhanced": "You are an ops-assistant that routes a request to exactly one tool. Return a JSON object with the category and the tool. Return ONLY the JSON object. Follow this schema: {category, tool, args}. Tools: restart_service, scale_service, rollback_deploy, fetch_logs, create_incident, none. No prose. Do not wrap it in code fences. Extract any IDs, names, versions, or emails into args. Choose exactly one category and exactly one tool. Use lowercase values exactly as listed. (Tuned via iterate.py — iteration 9, the pre-plateau optimum.)" + }, + "eval_cases": [ + { "id": "oc01", "input": "Roll back payments-api to v1.4.2, the new deploy is erroring.", "difficulty": 0.45, "expected": { "category": "deploy", "tool": "rollback_deploy", "args": { "service": "payments-api", "version": "v1.4.2" } } }, + { "id": "oc02", "input": "Scale the web frontend to 8 replicas, traffic is spiking.", "difficulty": 0.5, "expected": { "category": "reliability", "tool": "scale_service", "args": { "service": "web", "replicas": "8" } } }, + { "id": "oc03", "input": "Grab the logs for the auth service, we're seeing 401s.", "difficulty": 0.4, "expected": { "category": "observability", "tool": "fetch_logs", "args": { "service": "auth" } } }, + { "id": "oc04", "input": "Checkout is down, open a sev1 incident.", "difficulty": 0.6, "expected": { "category": "incident", "tool": "create_incident", "args": { "service": "checkout", "severity": "sev1" } } }, + { "id": "oc05", "input": "Restart the billing-worker, it's stuck.", "difficulty": 0.35, "expected": { "category": "reliability", "tool": "restart_service", "args": { "service": "billing-worker" } } }, + { "id": "oc06", "input": "Thanks for the help earlier!", "difficulty": 0.55, "expected": { "category": "other", "tool": "none", "args": {} } }, + { "id": "oc07", "input": "Payments latency is climbing, not sure why yet.", "difficulty": 0.7, "expected": { "category": "observability", "tool": 
"fetch_logs", "args": { "service": "payments" } } }, + { "id": "oc08", "input": "Bump search to 12 pods and roll it back if errors persist.", "difficulty": 0.96, "expected": { "category": "reliability", "tool": "scale_service", "args": { "service": "search", "replicas": "12" } } }, + { "id": "oc09", "input": "The last release broke prod, undo it for orders-api.", "difficulty": 0.85, "expected": { "category": "deploy", "tool": "rollback_deploy", "args": { "service": "orders-api" } } }, + { "id": "oc10", "input": "Open a sev2 incident for the notifications outage.", "difficulty": 0.65, "expected": { "category": "incident", "tool": "create_incident", "args": { "service": "notifications", "severity": "sev2" } } }, + { "id": "oc11", "input": "Restart api-gateway and also scale it to 6.", "difficulty": 0.97, "expected": { "category": "reliability", "tool": "restart_service", "args": { "service": "api-gateway" } } }, + { "id": "oc12", "input": "Show me logs from the cdn edge nodes.", "difficulty": 0.78, "expected": { "category": "observability", "tool": "fetch_logs", "args": { "service": "cdn" } } }, + { "id": "oc13", "input": "Everything is on fire, prod is down across the board.", "difficulty": 0.92, "expected": { "category": "incident", "tool": "create_incident", "args": { "service": "platform", "severity": "sev1" } } }, + { "id": "oc14", "input": "Can you make the app faster?", "difficulty": 0.99, "expected": { "category": "other", "tool": "none", "args": {} } } + ], + "sim": { + "claude-opus-4-6": { "literalness": 0.5, "base": 0.66, "latency_ms": 3800, "price_in": 0.000005, "price_out": 0.000025 }, + "claude-opus-4-8": { "literalness": 0.9, "base": 0.70, "latency_ms": 2300, "price_in": 0.000005, "price_out": 0.000025 } + } +} diff --git a/harness/scenarios/support_triage.json b/harness/scenarios/support_triage.json index b63a3df..6b92b27 100644 --- a/harness/scenarios/support_triage.json +++ b/harness/scenarios/support_triage.json @@ -1,5 +1,6 @@ { "name": 
"support-ticket-triage", + "role": "You are a support-triage assistant.", "description": "Classify a support ticket, optionally call one tool, and return a strict JSON object. Exercises output-contract conformance and tool-calling accuracy across a model migration.", "models": { "old": "claude-opus-4-6", "new": "claude-opus-4-8" }, "categories": ["billing", "technical", "account", "other"], @@ -11,7 +12,7 @@ ], "prompts": { "baseline": "You are a helpful support assistant. Read the customer ticket and decide how to handle it. Tell me the category and whether a tool is needed.", - "enhanced": "You are a support-triage assistant. Return ONLY a JSON object and no prose. Schema: {\"category\": one of billing|technical|account|other, \"tool\": one of lookup_order|issue_refund|reset_password|none, \"args\": object}. Do not wrap the JSON in code fences or commentary. Choose exactly one category and exactly one tool from the lists above." + "enhanced": "You are a support-triage assistant. Return ONLY a JSON object and no prose. Schema: {\"category\": one of billing|technical|account|other, \"tool\": one of lookup_order|issue_refund|reset_password|none, \"args\": object}. Do not wrap the JSON in code fences or commentary. Extract any order_id or email into args. Choose exactly one category and exactly one tool from the lists above. 
(Tuned via iterate.py — iteration 8, the pre-plateau optimum.)" }, "eval_cases": [ { "id": "c01", "input": "I was charged twice for order 4471, please refund one.", "difficulty": 0.15, "expected": { "category": "billing", "tool": "issue_refund", "args": { "order_id": "4471" } } }, diff --git a/harness/tests/test_harness.py b/harness/tests/test_harness.py index 00f1ee0..d953361 100644 --- a/harness/tests/test_harness.py +++ b/harness/tests/test_harness.py @@ -48,28 +48,35 @@ def test_task_requires_correct_category(self): class TestSimAndLoop(unittest.TestCase): @classmethod def setUpClass(cls): - cls.scenario = json.loads((HARNESS / "scenarios/support_triage.json").read_text()) + cls.scenarios = [ + json.loads(p.read_text()) + for p in sorted((HARNESS / "scenarios").glob("*.json")) + ] + assert len(cls.scenarios) >= 2, "expected multiple scenario fixtures" def test_enhanced_prompt_is_more_explicit(self): - e_base = explicitness(self.scenario["prompts"]["baseline"], self.scenario) - e_enh = explicitness(self.scenario["prompts"]["enhanced"], self.scenario) - self.assertGreater(e_enh["contract"], e_base["contract"]) - self.assertGreater(e_enh["task"], e_base["task"]) + for sc in self.scenarios: + with self.subTest(scenario=sc["name"]): + e_base = explicitness(sc["prompts"]["baseline"], sc) + e_enh = explicitness(sc["prompts"]["enhanced"], sc) + self.assertGreater(e_enh["contract"], e_base["contract"]) + self.assertGreater(e_enh["task"], e_base["task"]) def test_modelport_arm_beats_baseline(self): - sc = self.scenario - prov = SimProvider(sc) - old, new = sc["models"]["old"], sc["models"]["new"] - arms = [ - run.run_arm(prov, old, sc["prompts"]["baseline"], sc, sc["sim"][old]), - run.run_arm(prov, new, sc["prompts"]["baseline"], sc, sc["sim"][new]), - run.run_arm(prov, new, sc["prompts"]["enhanced"], sc, sc["sim"][new]), - ] - comp = run.composite(arms) - # enhanced arm should win on quality and on the composite - self.assertGreater(arms[2]["task"], arms[0]["task"]) - 
self.assertGreater(comp[2], comp[0]) - self.assertGreater(comp[2], comp[1]) + for sc in self.scenarios: + with self.subTest(scenario=sc["name"]): + prov = SimProvider(sc) + old, new = sc["models"]["old"], sc["models"]["new"] + arms = [ + run.run_arm(prov, old, sc["prompts"]["baseline"], sc, sc["sim"][old]), + run.run_arm(prov, new, sc["prompts"]["baseline"], sc, sc["sim"][new]), + run.run_arm(prov, new, sc["prompts"]["enhanced"], sc, sc["sim"][new]), + ] + comp = run.composite(arms) + # enhanced arm should win on quality and on the composite + self.assertGreater(arms[2]["task"], arms[0]["task"]) + self.assertGreater(comp[2], comp[0]) + self.assertGreater(comp[2], comp[1]) if __name__ == "__main__":