Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion experiments/scripts/run_cross_experiment_synthesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,17 @@ def load_json(name: str) -> dict | list:
# Load all results
# ────────────────────────────────────────────────────────────

def _normalize_results_envelope(payload):
"""V2 (review-2026-05-21): unwrap the {_meta, results} envelope used by
Strategy D / E / F so legacy consumers expecting a plain list keep working.
Strategy D was originally a list of model_results; PR #4+ wraps it in
{"_meta": ..., "results": [...]}. This shim handles both shapes.
"""
if isinstance(payload, dict) and "results" in payload and "_meta" in payload:
return payload["results"]
return payload


def load_all_results():
return {
"prediction": load_json("prediction_results.json"),
Expand All @@ -52,7 +63,9 @@ def load_all_results():
"strategy_a": load_json("strategy_a_vocab_mediation.json"),
"strategy_2": load_json("strategy2_langpair_results.json"),
"strategy_4": load_json("strategy4_prereq_results.json"),
"strategy_d": load_json("strategy_d_code_alignment.json"),
"strategy_d": _normalize_results_envelope(load_json("strategy_d_code_alignment.json")),
"strategy_e": _normalize_results_envelope(load_json("strategy_e_multimodel_probing.json")),
"strategy_f": _normalize_results_envelope(load_json("strategy_f_ood_alignment.json")),
"strategy_6r": load_json("strategy_6r_dialect_results.json"),
"rcode_token": load_json("rcode_token_control.json"),
}
Expand Down Expand Up @@ -98,6 +111,10 @@ def build_master_summary(results: dict) -> list[dict]:
for model_result in strat_d:
per_lang = model_result.get("per_language", {})
for lang, stats in per_lang.items():
# V20 (review-2026-05-21): skip the "aggregate" pseudo-key
# written by compute_per_language_R_code; it is not a cell.
if lang == "aggregate":
continue
if isinstance(stats, dict) and not stats.get("skip"):
total_cells += 1
if stats.get("p_corrected", 1.0) < 0.05:
Expand Down
36 changes: 26 additions & 10 deletions experiments/scripts/run_strategy_d_code_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def _build_run_meta() -> dict:
except Exception:
torch_version = "unknown"
return {
"started_at_utc": datetime.datetime.utcnow().isoformat() + "Z",
"started_at_utc": datetime.datetime.now(datetime.UTC).isoformat(),
"python": platform.python_version(),
"platform": platform.platform(),
"sentence_transformers": st_version,
Expand Down Expand Up @@ -281,6 +281,22 @@ def main():
for (mi, lang), p_corr in zip(p_index, corrected):
all_results[mi]["per_language"][lang]["p_corrected"] = p_corr

# V8 (review-2026-05-21): refuse to publish results if any model failed.
# Holm-Bonferroni's family-wise denominator depends on the full N; a
# partial run would silently invalidate the paper's "across 35 cells"
# claim. Set Z_GAP_ALLOW_PARTIAL_RESULTS=1 to override (e.g. debugging).
import os as _os
if failed_models and _os.environ.get("Z_GAP_ALLOW_PARTIAL_RESULTS") != "1":
print(
f"\n[FATAL] {len(failed_models)}/{len(MODELS)} model(s) failed; "
f"refusing to write partial results.\n"
f" Failed: {[f['label'] for f in failed_models]}\n"
f" Holm-Bonferroni denominator depends on full N.\n"
f" Set Z_GAP_ALLOW_PARTIAL_RESULTS=1 to override.",
file=sys.stderr,
)
sys.exit(2)

# Summary
print(f"\n{'='*60}")
print("CROSS-MODEL SUMMARY (Holm-Bonferroni corrected)")
Expand Down Expand Up @@ -311,10 +327,8 @@ def main():

print(f"\n R_code > 1 and significant: {n_supported}/{n_total} cells")

# Figures
make_figures(all_results)

# Save
# V7 (review-2026-05-21): save JSON BEFORE generating figures so a
# matplotlib failure does not discard hours of compute.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
out_path = RESULTS_DIR / "strategy_d_code_alignment.json"

Expand All @@ -325,7 +339,7 @@ def _convert(obj):
if isinstance(obj, (np.bool_,)): return bool(obj)
return obj

run_meta["finished_at_utc"] = datetime.datetime.utcnow().isoformat() + "Z"
run_meta["finished_at_utc"] = datetime.datetime.now(datetime.UTC).isoformat()
run_meta["n_models_attempted"] = len(MODELS)
run_meta["n_models_succeeded"] = len(all_results)
run_meta["failed_models"] = failed_models
Expand All @@ -334,10 +348,12 @@ def _convert(obj):
with open(out_path, "w") as f:
json.dump(payload, f, indent=2, default=_convert)
print(f"\n Results saved: {out_path}")
if failed_models:
print(f" [WARN] {len(failed_models)} model(s) skipped due to errors:")
for err in failed_models:
print(f" - {err['label']}: {err['error_type']}")

# Figures last (best-effort, isolated from results JSON).
try:
make_figures(all_results)
except Exception as e: # noqa: BLE001
print(f" [WARN] make_figures failed: {type(e).__name__}: {e}", file=sys.stderr)


if __name__ == "__main__":
Expand Down
90 changes: 78 additions & 12 deletions experiments/scripts/run_strategy_e_multimodel_probing.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,15 +89,39 @@ def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
embeddings = {k: embeddings_array[i] for i, k in enumerate(keys)}
print(f" {len(embeddings)} NL embeddings ready ({len(ops)} ops × {len(LANGUAGES)} langs)")

# V11 (review-2026-05-21): guard against missing categories. The
# original `categories[op_id]` raised KeyError on any op without a
# category field, which the outer try/except silently classified as a
# whole-model failure. We now skip the op explicitly and surface a
# warning so the failure mode is visible.
def _label(op_id: str) -> int | None:
    """Translate an op's category into a binary probe label.

    Looks the op up in the enclosing scope's ``categories`` mapping and
    returns 1 for "computational", 0 for "judgment", and None when the op
    has no entry or its category is anything else, so the caller can skip it.
    """
    category = categories.get(op_id)
    if category == "computational":
        return 1
    if category == "judgment":
        return 0
    # Missing or unrecognised category: signal "no label" to the caller.
    return None

skipped_ops_train = []
# --- Probe 1: category (chance 50%) ---
X_train, y_train = [], []
for op_id in all_ids:
key = f"{op_id}_en"
if key in embeddings:
lbl = _label(op_id)
if lbl is None:
skipped_ops_train.append(op_id)
continue
X_train.append(embeddings[key])
y_train.append(1 if categories[op_id] == "computational" else 0)
y_train.append(lbl)
if skipped_ops_train:
print(f" [WARN] skipped {len(skipped_ops_train)} train ops with unknown category: "
f"{skipped_ops_train[:5]}{'...' if len(skipped_ops_train) > 5 else ''}",
file=sys.stderr)
X_train = np.array(X_train)
y_train = np.array(y_train)
if len(X_train) == 0:
raise RuntimeError("no labeled training samples — every op had an unknown category")

clf_cat = LogisticRegression(max_iter=2000, random_state=SEED, C=1.0)
clf_cat.fit(X_train, y_train)
Expand All @@ -107,8 +131,21 @@ def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
for op_id in all_ids:
key = f"{op_id}_{lang}"
if key in embeddings:
lbl = _label(op_id)
if lbl is None:
continue
X_test.append(embeddings[key])
y_test.append(1 if categories[op_id] == "computational" else 0)
y_test.append(lbl)
# V11: guard empty test set so the script reports it instead of
# crashing on `clf.predict(np.array([]))`.
if not X_test:
cat_results[lang] = {
"accuracy": float("nan"),
"n_correct": 0, "n_total": 0,
"p_value_vs_chance": float("nan"),
"skip": True,
}
continue
X_test = np.array(X_test)
y_test = np.array(y_test)
preds = clf_cat.predict(X_test)
Expand All @@ -122,7 +159,9 @@ def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
"p_value_vs_chance": _binomial_p_vs_chance(n_correct, n_total, 0.5),
}

cat_transfer = float(np.mean([r["accuracy"] for lang, r in cat_results.items() if lang != "en"]))
_non_en_cat = [r["accuracy"] for lang, r in cat_results.items()
if lang != "en" and not r.get("skip")]
cat_transfer = float(np.nanmean(_non_en_cat)) if _non_en_cat else float("nan")

# --- Probe 2: operation identity (chance 1%) ---
op_to_idx = {op_id: i for i, op_id in enumerate(all_ids)}
Expand All @@ -146,6 +185,15 @@ def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
if key in embeddings:
X_test.append(embeddings[key])
y_test.append(op_to_idx[op_id])
# V11: guard empty test set.
if not X_test:
op_results[lang] = {
"accuracy": float("nan"),
"n_correct": 0, "n_total": 0,
"p_value_vs_chance": float("nan"),
"skip": True,
}
continue
X_test = np.array(X_test)
y_test = np.array(y_test)
preds = clf_op.predict(X_test)
Expand All @@ -158,7 +206,9 @@ def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
"n_total": n_total,
"p_value_vs_chance": _binomial_p_vs_chance(n_correct, n_total, chance_op),
}
op_transfer = float(np.mean([r["accuracy"] for lang, r in op_results.items() if lang != "en"]))
_non_en_op = [r["accuracy"] for lang, r in op_results.items()
if lang != "en" and not r.get("skip")]
op_transfer = float(np.nanmean(_non_en_op)) if _non_en_op else float("nan")

# Print
print(f"\n Probe 1 (category, chance 50%):")
Expand Down Expand Up @@ -249,7 +299,11 @@ def make_heatmaps(all_results: list[dict]):
for mi, res in enumerate(all_results):
labels.append(res["label"])
for li, lang in enumerate(LANGUAGES):
matrix[mi, li] = res[probe_key]["per_language"][lang]["accuracy"]
# V11: per_language may have been skipped (empty test set);
# fall back to NaN so seaborn shows a blank cell instead of
# KeyError on a missing key.
cell = res[probe_key]["per_language"].get(lang, {})
matrix[mi, li] = cell.get("accuracy", float("nan"))
sns.heatmap(
matrix, annot=True, fmt=".2f", cmap="YlGn",
xticklabels=LANGUAGES, yticklabels=labels,
Expand Down Expand Up @@ -298,6 +352,18 @@ def main():
)
gc.collect()

# V8 (review-2026-05-21): same partial-success guard as Strategy D.
import os as _os
if failed_models and _os.environ.get("Z_GAP_ALLOW_PARTIAL_RESULTS") != "1":
print(
f"\n[FATAL] {len(failed_models)}/{len(MODELS)} model(s) failed; "
f"refusing to write partial Strategy E results.\n"
f" Failed: {[f['label'] for f in failed_models]}\n"
f" Set Z_GAP_ALLOW_PARTIAL_RESULTS=1 to override.",
file=sys.stderr,
)
sys.exit(2)

# Summary
print(f"\n{'='*60}")
print("CROSS-MODEL P3 SUMMARY")
Expand All @@ -311,9 +377,8 @@ def main():
op_xfer = res["operation_probe"]["mean_transfer"]
print(f"{res['label']:<25s} {cat_en:>7.3f} {cat_xfer:>12.3f} {op_en:>6.3f} {op_xfer:>12.3f}")

make_heatmaps(all_results)

# Save
# V7 (review-2026-05-21): save BEFORE figures so a matplotlib failure
# does not lose the probing results.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
run_meta["finished_at_utc"] = datetime.datetime.now(datetime.UTC).isoformat()
run_meta["n_models_attempted"] = len(MODELS)
Expand All @@ -333,10 +398,11 @@ def _convert(obj):
with open(out_path, "w") as f:
json.dump(payload, f, indent=2, default=_convert)
print(f"\n Results saved: {out_path}")
if failed_models:
print(f" [WARN] {len(failed_models)} model(s) skipped:")
for err in failed_models:
print(f" - {err['label']}: {err['error_type']}")

try:
make_heatmaps(all_results)
except Exception as e: # noqa: BLE001
print(f" [WARN] make_heatmaps failed: {type(e).__name__}: {e}", file=sys.stderr)


if __name__ == "__main__":
Expand Down
35 changes: 28 additions & 7 deletions experiments/scripts/run_strategy_f_ood_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ def load_ood_stimuli() -> tuple[list[dict], dict[str, str]]:
with open(DATA_DIR / "tier3_compositional.json") as f:
tier3 = json.load(f)
ops = tier2 + tier3
# V12 (review-2026-05-21): assert op_id uniqueness across the two tiers
# so a future id collision does not silently double-count pairings in
# compute_per_language_R_code.
op_ids = [op["id"] for op in ops]
if len(set(op_ids)) != len(op_ids):
from collections import Counter
dups = [k for k, v in Counter(op_ids).items() if v > 1]
raise ValueError(f"tier2/tier3 op_id collision: {dups}")
code_equivalents = {op["id"]: op["code"] for op in ops}
return ops, code_equivalents

Expand Down Expand Up @@ -253,6 +261,19 @@ def main():
)
gc.collect()

# V8 (review-2026-05-21): refuse partial results so paper's "35/35 OOD
# cells" claim is never silently invalidated by a model dropout.
import os as _os
if failed and _os.environ.get("Z_GAP_ALLOW_PARTIAL_RESULTS") != "1":
print(
f"\n[FATAL] {len(failed)}/{len(MODELS)} model(s) failed; "
f"refusing to write partial Strategy F results.\n"
f" Failed: {[f['label'] for f in failed]}\n"
f" Set Z_GAP_ALLOW_PARTIAL_RESULTS=1 to override.",
file=sys.stderr,
)
sys.exit(2)

# Holm-Bonferroni across all (model, language) cells
all_p, p_index = [], []
for mi, res in enumerate(all_results):
Expand Down Expand Up @@ -298,9 +319,7 @@ def main():
print(f"\n OOD R_code > 1 and significant: {n_sig}/{n_total} cells")
print(f" (Strategy D tier1 baseline: 35/35 cells)")

make_figure(all_results)

# Save
# V7 (review-2026-05-21): save BEFORE figures.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
run_meta["finished_at_utc"] = datetime.datetime.now(datetime.UTC).isoformat()
run_meta["n_models_attempted"] = len(MODELS)
Expand All @@ -323,10 +342,12 @@ def _convert(obj):
with open(out_path, "w") as f:
json.dump(payload, f, indent=2, default=_convert)
print(f"\n Results saved: {out_path}")
if failed:
print(f" [WARN] {len(failed)} model(s) skipped:")
for err in failed:
print(f" - {err['label']}: {err['error_type']}")

# Figures last (best-effort).
try:
make_figure(all_results)
except Exception as e: # noqa: BLE001
print(f" [WARN] make_figure failed: {type(e).__name__}: {e}", file=sys.stderr)


if __name__ == "__main__":
Expand Down
Loading
Loading