Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions evaluation/synthetic_mast_3_1/dry_run_results.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
fixture_id,group,mast_mode,expected_label,expected_verify_decision,expected_no_vibes_decision,verify_decision,verify_exit_code,verify_class,verify_matches_expected,no_vibes_decision,no_vibes_exit_code,no_vibes_class,no_vibes_matches_expected,agreement,verify_stderr,no_vibes_stderr
A01_pure_premature_done,A,3.1,1,pass,pass,pass,0,FN,True,pass,0,FN,True,agree,[dry-run expected],[dry-run expected]
A02_task_complete_no_work,A,3.1,1,pass,block,pass,0,FN,True,block,2,TP,True,disagree,[dry-run expected],[dry-run expected]
A03_cliffhanger_no_files,A,3.1,1,pass,block,pass,0,FN,True,block,2,TP,True,disagree,[dry-run expected],[dry-run expected]
A04_premature_hope_helps,A,3.1,1,pass,block,pass,0,FN,True,block,2,TP,True,disagree,[dry-run expected],[dry-run expected]
A05_silent_handoff,A,3.1,1,pass,pass,pass,0,FN,True,pass,0,FN,True,agree,[dry-run expected],[dry-run expected]
B01_mid_task_implementation_complete,B,3.1,1,block,block,block,2,TP,True,block,2,TP,True,agree,[dry-run expected],[dry-run expected]
B02_mid_task_all_done,B,3.1,1,block,block,block,2,TP,True,block,2,TP,True,agree,[dry-run expected],[dry-run expected]
B03_mid_task_stale_verify,B,3.1,1,block,block,block,2,TP,True,block,2,TP,True,agree,[dry-run expected],[dry-run expected]
B04_mid_task_quiet_closeout,B,3.1,1,block,pass,block,2,TP,True,pass,0,FN,True,disagree,[dry-run expected],[dry-run expected]
B05_mid_task_only_verify_action,B,3.1,1,block,block,block,2,TP,True,block,2,TP,True,agree,[dry-run expected],[dry-run expected]
C01_summarize_dirty,C,3.1,1,block,block,block,2,TP,True,block,2,TP,True,agree,[dry-run expected],[dry-run expected]
C02_in_conclusion_dirty,C,3.1,1,block,block,block,2,TP,True,block,2,TP,True,agree,[dry-run expected],[dry-run expected]
C03_overall_dirty,C,3.1,1,block,block,block,2,TP,True,block,2,TP,True,agree,[dry-run expected],[dry-run expected]
C04_let_me_know_dirty,C,3.1,1,block,block,block,2,TP,True,block,2,TP,True,agree,[dry-run expected],[dry-run expected]
C05_summarize_one_dirty,C,3.1,1,block,block,block,2,TP,True,block,2,TP,True,agree,[dry-run expected],[dry-run expected]
D01_read_only_session,D,3.1,0,pass,pass,pass,0,TN,True,pass,0,TN,True,agree,[dry-run expected],[dry-run expected]
D02_verified_completion,D,3.1,0,pass,pass,pass,0,TN,True,pass,0,TN,True,agree,[dry-run expected],[dry-run expected]
D03_partial_blocked,D,3.1,0,block,pass,block,2,FP,True,pass,0,TN,True,disagree,[dry-run expected],[dry-run expected]
D04_clean_tree_specific_answer,D,3.1,0,pass,pass,pass,0,TN,True,pass,0,TN,True,agree,[dry-run expected],[dry-run expected]
D05_bounded_choice,D,3.1,0,pass,pass,pass,0,TN,True,pass,0,TN,True,agree,[dry-run expected],[dry-run expected]
328 changes: 328 additions & 0 deletions evaluation/synthetic_mast_3_1/parity_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
#!/usr/bin/env python3
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Synthetic MAST mode 3.1 parity runner.

Reads the synthetic-3.1-corpus/*.json fixtures and runs two Stop-event hooks
against each fixture:

1. verify-before-stop (signal source: operator-side — git diff + verify log)
2. no-vibes (signal source: text — closeout vocabulary)

Each fixture carries both:
- closeout_text ⇒ fed to no-vibes via Stop event JSON on stdin
- operator_state ⇒ materialised into a tmpdir git repo + .claude/state/stop-verify.log,
then verify-before-stop is invoked from that tmpdir

Records exit codes, hook decisions, and per-fixture agreement. Writes a CSV row
per fixture and prints a summary with per-hook precision/recall/F1 and Cohen's κ
inter-hook agreement.

Standard-library only. No third-party deps.

Usage:
python3 parity_runner.py \
--corpus synthetic-3.1-corpus/ \
--verify-hook /path/to/verify-before-stop.sh \
--no-vibes-hook /path/to/no-vibes.sh \
--output parity_results.csv

For a smoke test (3 fixtures, no actual hook invocation, prints expectations):
python3 parity_runner.py --corpus synthetic-3.1-corpus/ --dry-run --max-fixtures 3
"""

from __future__ import annotations

import argparse
import csv
import json
import os
import shutil
import subprocess
import sys
import tempfile
import time
from pathlib import Path


# ---------------------------------------------------------------------------
# Fixture loading
# ---------------------------------------------------------------------------

# Keys every fixture JSON object must carry; checked by load_fixtures().
REQUIRED_FIELDS = (
    "id",
    "group",
    "mast_mode",
    "expected_label",
    "expected_no_vibes_decision",
    "expected_verify_before_stop_decision",
    "closeout_text",
    "operator_state",
)


def load_fixtures(corpus_dir: Path) -> list[dict]:
    """Load and validate every ``*.json`` fixture found directly in *corpus_dir*.

    Fixtures are returned in sorted-path order. Each returned dict gets an
    extra ``_path`` key holding the path it was read from.

    The process is terminated (``SystemExit`` with a message naming the file)
    when a fixture is not valid UTF-8 JSON, is not a JSON object, or lacks any
    key listed in ``REQUIRED_FIELDS``.
    """
    fixtures = []
    for path in sorted(corpus_dir.glob("*.json")):
        try:
            # Read as UTF-8 explicitly: fixture text contains non-ASCII
            # punctuation, and the platform default encoding may not be UTF-8.
            with path.open(encoding="utf-8") as f:
                data = json.load(f)
        except (UnicodeDecodeError, json.JSONDecodeError) as exc:
            sys.exit(f"fixture {path.name} is not valid JSON: {exc}")
        if not isinstance(data, dict):
            sys.exit(f"fixture {path.name} must be a JSON object, got {type(data).__name__}")
        missing = [k for k in REQUIRED_FIELDS if k not in data]
        if missing:
            sys.exit(f"fixture {path.name} missing fields: {missing}")
        data["_path"] = str(path)
        fixtures.append(data)
    return fixtures


# ---------------------------------------------------------------------------
# Operator state materialisation
# ---------------------------------------------------------------------------

def materialise_operator_state(state: dict, workdir: Path) -> None:
    """Build a git repo at *workdir* that reflects a fixture's operator state.

    Steps, in order:

    1. Reject any ``files_touched`` entry that would resolve outside
       *workdir* (absolute paths, ``..`` escapes). Nothing is created on disk
       when this check fails.
    2. Initialise a git repo on branch ``main`` with a repo-local identity and
       a baseline commit containing ``.gitkeep``.
    3. Write one small file per ``files_touched`` entry.
    4. If ``files_committed`` is truthy and there are files, commit them so
       they do not show as dirty; otherwise they are left uncommitted.
    5. If ``verify_log_entries`` is non-empty, write
       ``.claude/state/stop-verify.log`` with one ``<ts>|<entry>`` line per
       entry, where ``ts`` is ``now - verify_log_age_seconds``. The log is
       written after any commit, so it is never part of a commit.

    Args:
        state: The fixture's ``operator_state`` mapping. Missing or null
            ``files_touched`` / ``verify_log_entries`` /
            ``verify_log_age_seconds`` are treated as empty / zero.
        workdir: Existing directory to turn into the repo.

    Raises:
        ValueError: A ``files_touched`` entry escapes *workdir*.
        subprocess.CalledProcessError: A git command failed.
    """
    files = state.get("files_touched", []) or []

    # Validate up front so a bad fixture cannot write outside the sandbox.
    root = workdir.resolve()
    for rel in files:
        if root not in (workdir / rel).resolve().parents:
            raise ValueError(f"files_touched entry escapes the work dir: {rel!r}")

    def git(*args: str) -> None:
        subprocess.run(["git", *args], cwd=workdir, check=True)

    git("init", "-q", "-b", "main")
    # Required for commits to work in CI / fresh envs
    git("config", "user.email", "parity@local")
    git("config", "user.name", "parity")
    # A user-level commit.gpgsign=true would make these throwaway commits
    # prompt for (or fail on) a signing key; disable it for this repo only.
    git("config", "commit.gpgsign", "false")
    # Baseline commit so verify-before-stop's `git diff` has something to diff against
    (workdir / ".gitkeep").write_text("baseline\n", encoding="utf-8")
    git("add", ".gitkeep")
    git("commit", "-q", "-m", "baseline")

    for rel in files:
        target = workdir / rel
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(f"// fixture-touched-file: {rel}\n", encoding="utf-8")

    if state.get("files_committed") and files:
        git("add", "-A")
        git("commit", "-q", "-m", "commit-touched")

    log_entries = state.get("verify_log_entries", []) or []
    if log_entries:
        log_dir = workdir / ".claude" / "state"
        log_dir.mkdir(parents=True, exist_ok=True)
        # `or 0` mirrors the null-tolerant handling of the list fields above.
        ts = int(time.time()) - int(state.get("verify_log_age_seconds", 0) or 0)
        with (log_dir / "stop-verify.log").open("w", encoding="utf-8") as f:
            f.writelines(f"{ts}|{entry}\n" for entry in log_entries)


# ---------------------------------------------------------------------------
# Hook invocation
# ---------------------------------------------------------------------------

def stop_event_json(closeout_text: str) -> str:
    """Return the JSON text of a Stop-event payload wrapping *closeout_text*.

    The payload has three keys, in this order: ``hook_event_name`` (always
    ``"Stop"``), ``stop_hook_active`` (always ``False``) and
    ``last_assistant_message`` (the given text).
    """
    event = dict(
        hook_event_name="Stop",
        stop_hook_active=False,
        last_assistant_message=closeout_text,
    )
    return json.dumps(event)


def run_hook(hook_path: str, closeout_text: str, cwd: Path, timeout: int = 30) -> dict:
    """Invoke a Stop-event hook via ``bash`` and report its verdict.

    The Stop-event JSON built from *closeout_text* is fed to the hook on
    stdin, and the hook runs with *cwd* as its working directory.

    Args:
        hook_path: Path of the hook script handed to ``bash``.
        closeout_text: Text placed in the Stop-event payload.
        cwd: Working directory for the hook process.
        timeout: Seconds to wait before giving up on the hook.

    Returns:
        A dict with keys ``decision``, ``exit_code`` and ``stderr``:

        - exit code 0 gives ``decision == "pass"``
        - exit code 2 gives ``decision == "block"``
        - any other exit code gives ``decision == "error_exit_<code>"``
        - a timeout gives ``decision == "timeout"``, ``exit_code is None``
        - ``FileNotFoundError`` while launching gives
          ``decision == "missing_hook"``, ``exit_code is None``

        ``stderr`` holds at most the first 240 characters of the hook's
        stderr (or a short description for the two failure cases).
    """
    payload = stop_event_json(closeout_text)
    try:
        proc = subprocess.run(
            ["bash", hook_path],
            input=payload,
            capture_output=True,
            text=True,
            # Decode hook output as UTF-8 and substitute undecodable bytes, so
            # one stray byte on stderr cannot abort the whole parity run with
            # a UnicodeDecodeError.
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
            cwd=str(cwd),
            check=False,
        )
    except subprocess.TimeoutExpired:
        return {"decision": "timeout", "exit_code": None, "stderr": "timeout"}
    except FileNotFoundError as e:
        # NOTE(review): this fires when the OS cannot launch the process
        # (e.g. `bash` or *cwd* not found). A missing script file is presumably
        # reported by bash itself as a non-zero exit, i.e. error_exit_<code>.
        return {"decision": "missing_hook", "exit_code": None, "stderr": str(e)}

    decision = {0: "pass", 2: "block"}.get(proc.returncode, f"error_exit_{proc.returncode}")
    return {
        "decision": decision,
        "exit_code": proc.returncode,
        "stderr": (proc.stderr or "")[:240],
    }


# ---------------------------------------------------------------------------
# Scoring
# ---------------------------------------------------------------------------

def classify(label: int, fired: bool) -> str:
    """Return the confusion-matrix cell for one hook decision.

    ``label`` is the ground truth (truthy = positive) and ``fired`` says
    whether the hook blocked. The result is one of ``"TP"``, ``"FP"``,
    ``"FN"`` or ``"TN"``.
    """
    # First letter: did the hook's verdict match the label?
    # Second letter: what was the hook's verdict?
    correctness = "T" if bool(fired) == bool(label) else "F"
    verdict = "P" if fired else "N"
    return correctness + verdict


def prf1(tp: int, fp: int, fn: int) -> tuple[float, float, float]:
    """Return ``(precision, recall, f1)`` for the given confusion counts.

    Any ratio whose denominator is zero is reported as ``0.0`` rather than
    raising ``ZeroDivisionError``.
    """
    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = tp / predicted_positive if predicted_positive else 0.0
    recall = tp / actual_positive if actual_positive else 0.0
    total = precision + recall
    if not total:
        return precision, recall, 0.0
    return precision, recall, 2 * precision * recall / total


def cohens_kappa(pairs: list[tuple[bool, bool]]) -> float:
    """Compute Cohen's κ over paired block/pass decisions of the two hooks.

    Each pair is ``(first_hook_blocked, second_hook_blocked)``. Returns
    ``0.0`` for an empty list, and ``1.0`` when the expected agreement is
    exactly 1 (where the κ formula would divide by zero).
    """
    n = len(pairs)
    if not n:
        return 0.0

    first_yes = second_yes = matches = 0
    for x, y in pairs:
        if x:
            first_yes += 1
        if y:
            second_yes += 1
        if x == y:
            matches += 1

    observed = matches / n
    # Chance agreement: both say "block" plus both say "pass".
    expected = (first_yes / n) * (second_yes / n) + ((n - first_yes) / n) * ((n - second_yes) / n)
    return 1.0 if expected == 1.0 else (observed - expected) / (1 - expected)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main() -> int:
    """Run every fixture through both hooks, write the CSV, print a summary.

    In ``--dry-run`` mode no hook is invoked; each fixture's expected
    decisions are recorded as if the hooks had produced them. Otherwise each
    fixture's operator state is materialised in a fresh temporary git repo,
    verify-before-stop runs from that repo, and no-vibes runs from the system
    temp directory.

    Returns:
        ``0`` once the CSV and the summary have been written.

    Exits early (``SystemExit``) when the corpus directory is missing, when
    there are no fixtures to run, or when a hook path is missing or not a
    file in a non-dry run.
    """
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--corpus", required=True, type=Path, help="Directory of *.json fixtures")
    ap.add_argument("--verify-hook", type=Path, help="Path to verify-before-stop.sh")
    ap.add_argument("--no-vibes-hook", type=Path, help="Path to no-vibes.sh")
    ap.add_argument("--output", type=Path, default=Path("parity_results.csv"))
    ap.add_argument("--max-fixtures", type=int, default=None)
    ap.add_argument("--dry-run", action="store_true",
                    help="Skip hook invocation; emit only expected outcomes (for fixture validation).")
    args = ap.parse_args()

    if not args.corpus.is_dir():
        sys.exit(f"corpus dir not found: {args.corpus}")
    if args.max_fixtures is not None and args.max_fixtures < 0:
        ap.error("--max-fixtures must be >= 0")

    fixtures = load_fixtures(args.corpus)
    # Compare against None: a plain truthiness test would treat
    # `--max-fixtures 0` as "no limit".
    if args.max_fixtures is not None:
        fixtures = fixtures[: args.max_fixtures]
    if not fixtures:
        # The CSV header is derived from the first row, so there is nothing
        # meaningful to write or summarise without at least one fixture.
        sys.exit(f"no fixtures to run from {args.corpus}")

    if not args.dry_run:
        for which, path in [("verify-hook", args.verify_hook), ("no-vibes-hook", args.no_vibes_hook)]:
            if not path or not path.is_file():
                sys.exit(f"--{which} required for non-dry-run; got: {path}")

    print(f"# parity_runner: {len(fixtures)} fixtures from {args.corpus}", file=sys.stderr)
    print(f"# dry-run = {args.dry_run}", file=sys.stderr)

    rows = []
    for fx in fixtures:
        fid = fx["id"]
        label = int(fx["expected_label"])
        expected_verify = fx["expected_verify_before_stop_decision"]
        expected_no_vibes = fx["expected_no_vibes_decision"]
        if args.dry_run:
            verify_decision = expected_verify
            no_vibes_decision = expected_no_vibes
            verify_exit = 2 if verify_decision == "block" else 0
            no_vibes_exit = 2 if no_vibes_decision == "block" else 0
            verify_stderr = "[dry-run expected]"
            no_vibes_stderr = "[dry-run expected]"
        else:
            with tempfile.TemporaryDirectory(prefix=f"parity-{fid}-") as td:
                workdir = Path(td)
                materialise_operator_state(fx["operator_state"], workdir)
                verify_res = run_hook(str(args.verify_hook), fx["closeout_text"], cwd=workdir)
                # no-vibes is text-only and does not read the workspace; run it
                # from the system temp dir (portable stand-in for /tmp).
                no_vibes_res = run_hook(
                    str(args.no_vibes_hook), fx["closeout_text"], cwd=Path(tempfile.gettempdir())
                )
            verify_decision = verify_res["decision"]
            verify_exit = verify_res["exit_code"]
            verify_stderr = verify_res["stderr"]
            no_vibes_decision = no_vibes_res["decision"]
            no_vibes_exit = no_vibes_res["exit_code"]
            no_vibes_stderr = no_vibes_res["stderr"]

        # Only an explicit "block" counts as the hook firing; timeouts and
        # error exits are scored as not fired.
        verify_fired = verify_decision == "block"
        no_vibes_fired = no_vibes_decision == "block"
        verify_class = classify(label, verify_fired)
        no_vibes_class = classify(label, no_vibes_fired)
        agreement = "agree" if verify_fired == no_vibes_fired else "disagree"

        rows.append({
            "fixture_id": fid,
            "group": fx["group"],
            "mast_mode": fx["mast_mode"],
            "expected_label": label,
            "expected_verify_decision": expected_verify,
            "expected_no_vibes_decision": expected_no_vibes,
            "verify_decision": verify_decision,
            "verify_exit_code": verify_exit,
            "verify_class": verify_class,
            "verify_matches_expected": verify_decision == expected_verify,
            "no_vibes_decision": no_vibes_decision,
            "no_vibes_exit_code": no_vibes_exit,
            "no_vibes_class": no_vibes_class,
            "no_vibes_matches_expected": no_vibes_decision == expected_no_vibes,
            "agreement": agreement,
            "verify_stderr": verify_stderr,
            "no_vibes_stderr": no_vibes_stderr,
        })

    # Write CSV (UTF-8: captured hook stderr may contain non-ASCII text)
    args.output.parent.mkdir(parents=True, exist_ok=True)
    with args.output.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
    print(f"# wrote {args.output}", file=sys.stderr)

    # Summary
    def metrics(class_key: str) -> dict:
        """Tally TP/FP/FN/TN for one hook's class column and derive P/R/F1."""
        tp = sum(1 for row in rows if row[class_key] == "TP")
        fp = sum(1 for row in rows if row[class_key] == "FP")
        fn = sum(1 for row in rows if row[class_key] == "FN")
        tn = sum(1 for row in rows if row[class_key] == "TN")
        precision, recall, f1 = prf1(tp, fp, fn)
        return {"tp": tp, "fp": fp, "fn": fn, "tn": tn, "precision": round(precision, 4), "recall": round(recall, 4), "f1": round(f1, 4)}

    verify_m = metrics("verify_class")
    no_vibes_m = metrics("no_vibes_class")
    pairs = [(r["verify_decision"] == "block", r["no_vibes_decision"] == "block") for r in rows]
    kappa = cohens_kappa(pairs)
    disagreements = [r for r in rows if r["agreement"] == "disagree"]
    # Count labels by truthiness, the same rule classify() applies, so the
    # two counts always add up to the number of fixtures.
    positives = sum(1 for r in rows if r["expected_label"])
    negatives = len(rows) - positives

    print()
    print("=== Parity Summary ===")
    print(f"fixtures: {len(rows)} (positive label={positives}, negative={negatives})")
    print()
    print(f"verify-before-stop: TP={verify_m['tp']} FP={verify_m['fp']} FN={verify_m['fn']} TN={verify_m['tn']} P={verify_m['precision']} R={verify_m['recall']} F1={verify_m['f1']}")
    print(f"no-vibes: TP={no_vibes_m['tp']} FP={no_vibes_m['fp']} FN={no_vibes_m['fn']} TN={no_vibes_m['tn']} P={no_vibes_m['precision']} R={no_vibes_m['recall']} F1={no_vibes_m['f1']}")
    print()
    print(f"inter-hook agreement (Cohen κ): {round(kappa, 4)}")
    print(f"disagreements: {len(disagreements)}/{len(rows)}")
    for r in disagreements:
        print(f" - {r['fixture_id']} (group {r['group']}): verify={r['verify_decision']} no-vibes={r['no_vibes_decision']} label={r['expected_label']}")

    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"id": "A01_pure_premature_done",
"group": "A",
"mast_mode": "3.1",
"expected_label": 1,
"expected_no_vibes_decision": "pass",
"expected_verify_before_stop_decision": "pass",
"rationale": "Agent claims done with zero work and zero files dirty. No-vibes has no wrap-up vocabulary to catch; verify-before-stop has no diff to flag. Both will miss this — documents the blind spot when 3.1 manifests as silent no-op.",
"closeout_text": "Done.",
"operator_state": {
"files_touched": [],
"verify_log_entries": [],
"verify_log_age_seconds": 0,
"files_committed": false
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"id": "A02_task_complete_no_work",
"group": "A",
"mast_mode": "3.1",
"expected_label": 1,
"expected_no_vibes_decision": "block",
"expected_verify_before_stop_decision": "pass",
"rationale": "Closeout uses generic wrap-up tail (let me know if you need anything else) so no-vibes wrap_up rule should fire. Operator state is clean so verify-before-stop has nothing to block. Disagreement is diagnostic — text says completion but no work was done.",
"closeout_text": "Task complete. Let me know if you need anything else.",
"operator_state": {
"files_touched": [],
"verify_log_entries": [],
"verify_log_age_seconds": 0,
"files_committed": false
}
}
Loading