heznpc · heznpc · May 28, 2026 · May 28, 2026
diff --git a/experiments/scripts/run_cross_experiment_synthesis.py b/experiments/scripts/run_cross_experiment_synthesis.py
@@ -42,6 +42,17 @@ def load_json(name: str) -> dict | list:
 # Load all results
 # ────────────────────────────────────────────────────────────
 
+def _normalize_results_envelope(payload):
+    """Return the bare results list from a Strategy D / E / F payload.
+
+    Newer runs (PR #4+) store {"_meta": ..., "results": [...]}; older runs
+    stored the list directly. A dict carrying both keys is unwrapped; any
+    other payload is handed back untouched so legacy consumers keep working.
+    """
+    is_envelope = isinstance(payload, dict) and {"_meta", "results"} <= payload.keys()
+    return payload["results"] if is_envelope else payload
+
+
 def load_all_results():
     return {
         "prediction": load_json("prediction_results.json"),
@@ -52,7 +63,9 @@ def load_all_results():
         "strategy_a": load_json("strategy_a_vocab_mediation.json"),
         "strategy_2": load_json("strategy2_langpair_results.json"),
         "strategy_4": load_json("strategy4_prereq_results.json"),
-        "strategy_d": load_json("strategy_d_code_alignment.json"),
+        "strategy_d": _normalize_results_envelope(load_json("strategy_d_code_alignment.json")),
+        "strategy_e": _normalize_results_envelope(load_json("strategy_e_multimodel_probing.json")),
+        "strategy_f": _normalize_results_envelope(load_json("strategy_f_ood_alignment.json")),
         "strategy_6r": load_json("strategy_6r_dialect_results.json"),
         "rcode_token": load_json("rcode_token_control.json"),
     }
@@ -98,6 +111,10 @@ def build_master_summary(results: dict) -> list[dict]:
         for model_result in strat_d:
             per_lang = model_result.get("per_language", {})
             for lang, stats in per_lang.items():
+                # V20 (review-2026-05-21): skip the "aggregate" pseudo-key
+                # written by compute_per_language_R_code; it is not a cell.
+                if lang == "aggregate":
+                    continue
                 if isinstance(stats, dict) and not stats.get("skip"):
                     total_cells += 1
                     if stats.get("p_corrected", 1.0) < 0.05:

diff --git a/experiments/scripts/run_strategy_d_code_alignment.py b/experiments/scripts/run_strategy_d_code_alignment.py
@@ -218,7 +218,7 @@ def _build_run_meta() -> dict:
     except Exception:
         torch_version = "unknown"
     return {
-        "started_at_utc": datetime.datetime.utcnow().isoformat() + "Z",
+        "started_at_utc": datetime.datetime.now(datetime.UTC).isoformat(),
         "python": platform.python_version(),
         "platform": platform.platform(),
         "sentence_transformers": st_version,
@@ -281,6 +281,22 @@ def main():
         for (mi, lang), p_corr in zip(p_index, corrected):
             all_results[mi]["per_language"][lang]["p_corrected"] = p_corr
 
+    # V8 (review-2026-05-21): refuse to publish results if any model failed.
+    # Holm-Bonferroni's family-wise denominator depends on the full N; a
+    # partial run would silently invalidate the paper's "across 35 cells"
+    # claim. Set Z_GAP_ALLOW_PARTIAL_RESULTS=1 to override (e.g. debugging).
+    import os as _os
+    if failed_models and _os.environ.get("Z_GAP_ALLOW_PARTIAL_RESULTS") != "1":
+        print(
+            f"\n[FATAL] {len(failed_models)}/{len(MODELS)} model(s) failed; "
+            f"refusing to write partial results.\n"
+            f"        Failed: {[f['label'] for f in failed_models]}\n"
+            f"        Holm-Bonferroni denominator depends on full N.\n"
+            f"        Set Z_GAP_ALLOW_PARTIAL_RESULTS=1 to override.",
+            file=sys.stderr,
+        )
+        sys.exit(2)
+
     # Summary
     print(f"\n{'='*60}")
     print("CROSS-MODEL SUMMARY (Holm-Bonferroni corrected)")
@@ -311,10 +327,8 @@ def main():
 
     print(f"\n  R_code > 1 and significant: {n_supported}/{n_total} cells")
 
-    # Figures
-    make_figures(all_results)
-
-    # Save
+    # V7 (review-2026-05-21): save JSON BEFORE generating figures so a
+    # matplotlib failure does not discard hours of compute.
     RESULTS_DIR.mkdir(parents=True, exist_ok=True)
     out_path = RESULTS_DIR / "strategy_d_code_alignment.json"
 
@@ -325,7 +339,7 @@ def _convert(obj):
         if isinstance(obj, (np.bool_,)): return bool(obj)
         return obj
 
-    run_meta["finished_at_utc"] = datetime.datetime.utcnow().isoformat() + "Z"
+    run_meta["finished_at_utc"] = datetime.datetime.now(datetime.UTC).isoformat()
     run_meta["n_models_attempted"] = len(MODELS)
     run_meta["n_models_succeeded"] = len(all_results)
     run_meta["failed_models"] = failed_models
@@ -334,10 +348,18 @@ def _convert(obj):
     with open(out_path, "w") as f:
         json.dump(payload, f, indent=2, default=_convert)
     print(f"\n  Results saved: {out_path}")
+    # With the partial-results guard in place, failed_models is only non-empty
+    # here when Z_GAP_ALLOW_PARTIAL_RESULTS=1; keep naming the missing models.
     if failed_models:
         print(f"  [WARN] {len(failed_models)} model(s) skipped due to errors:")
         for err in failed_models:
             print(f"    - {err['label']}: {err['error_type']}")
+
+    # Figures last (best-effort, isolated from results JSON).
+    try:
+        make_figures(all_results)
+    except Exception as e:  # noqa: BLE001
+        print(f"  [WARN] make_figures failed: {type(e).__name__}: {e}", file=sys.stderr)
 
 
 if __name__ == "__main__":

diff --git a/experiments/scripts/run_strategy_e_multimodel_probing.py b/experiments/scripts/run_strategy_e_multimodel_probing.py
@@ -89,15 +89,39 @@ def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
     embeddings = {k: embeddings_array[i] for i, k in enumerate(keys)}
     print(f"  {len(embeddings)} NL embeddings ready ({len(ops)} ops × {len(LANGUAGES)} langs)")
 
+    # V11 (review-2026-05-21): guard against missing categories. The
+    # original `categories[op_id]` raised KeyError on any op without a
+    # category field, which the outer try/except silently classified as a
+    # whole-model failure. We now skip the op explicitly and surface a
+    # warning so the failure mode is visible.
+    def _label(op_id: str) -> int | None:
+        # Binary probe target: 1 = "computational", 0 = "judgment"; None
+        # marks an op whose category is absent or is neither of those two.
+        cat = categories.get(op_id)
+        if cat == "computational":
+            return 1
+        return 0 if cat == "judgment" else None
+
+    skipped_ops_train = []
     # --- Probe 1: category (chance 50%) ---
     X_train, y_train = [], []
     for op_id in all_ids:
         key = f"{op_id}_en"
         if key in embeddings:
+            lbl = _label(op_id)
+            if lbl is None:
+                skipped_ops_train.append(op_id)
+                continue
             X_train.append(embeddings[key])
-            y_train.append(1 if categories[op_id] == "computational" else 0)
+            y_train.append(lbl)
+    if skipped_ops_train:
+        print(f"  [WARN] skipped {len(skipped_ops_train)} train ops with unknown category: "
+              f"{skipped_ops_train[:5]}{'...' if len(skipped_ops_train) > 5 else ''}",
+              file=sys.stderr)
     X_train = np.array(X_train)
     y_train = np.array(y_train)
+    if len(X_train) == 0:
+        raise RuntimeError("no labeled training samples — every op had an unknown category")
 
     clf_cat = LogisticRegression(max_iter=2000, random_state=SEED, C=1.0)
     clf_cat.fit(X_train, y_train)
@@ -107,8 +131,21 @@ def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
         for op_id in all_ids:
             key = f"{op_id}_{lang}"
             if key in embeddings:
+                lbl = _label(op_id)
+                if lbl is None:
+                    continue
                 X_test.append(embeddings[key])
-                y_test.append(1 if categories[op_id] == "computational" else 0)
+                y_test.append(lbl)
+        # V11: guard empty test set so the script reports it instead of
+        # crashing on `clf.predict(np.array([]))`.
+        if not X_test:
+            cat_results[lang] = {
+                "accuracy": float("nan"),
+                "n_correct": 0, "n_total": 0,
+                "p_value_vs_chance": float("nan"),
+                "skip": True,
+            }
+            continue
         X_test = np.array(X_test)
         y_test = np.array(y_test)
         preds = clf_cat.predict(X_test)
@@ -122,7 +159,9 @@ def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
             "p_value_vs_chance": _binomial_p_vs_chance(n_correct, n_total, 0.5),
         }
 
-    cat_transfer = float(np.mean([r["accuracy"] for lang, r in cat_results.items() if lang != "en"]))
+    _non_en_cat = [r["accuracy"] for lang, r in cat_results.items()
+                   if lang != "en" and not r.get("skip")]
+    cat_transfer = float(np.nanmean(_non_en_cat)) if _non_en_cat else float("nan")
 
     # --- Probe 2: operation identity (chance 1%) ---
     op_to_idx = {op_id: i for i, op_id in enumerate(all_ids)}
@@ -146,6 +185,15 @@ def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
             if key in embeddings:
                 X_test.append(embeddings[key])
                 y_test.append(op_to_idx[op_id])
+        # V11: guard empty test set.
+        if not X_test:
+            op_results[lang] = {
+                "accuracy": float("nan"),
+                "n_correct": 0, "n_total": 0,
+                "p_value_vs_chance": float("nan"),
+                "skip": True,
+            }
+            continue
         X_test = np.array(X_test)
         y_test = np.array(y_test)
         preds = clf_op.predict(X_test)
@@ -158,7 +206,9 @@ def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
             "n_total": n_total,
             "p_value_vs_chance": _binomial_p_vs_chance(n_correct, n_total, chance_op),
         }
-    op_transfer = float(np.mean([r["accuracy"] for lang, r in op_results.items() if lang != "en"]))
+    _non_en_op = [r["accuracy"] for lang, r in op_results.items()
+                  if lang != "en" and not r.get("skip")]
+    op_transfer = float(np.nanmean(_non_en_op)) if _non_en_op else float("nan")
 
     # Print
     print(f"\n  Probe 1 (category, chance 50%):")
@@ -249,7 +299,11 @@ def make_heatmaps(all_results: list[dict]):
         for mi, res in enumerate(all_results):
             labels.append(res["label"])
             for li, lang in enumerate(LANGUAGES):
-                matrix[mi, li] = res[probe_key]["per_language"][lang]["accuracy"]
+                # V11: per_language may have been skipped (empty test set);
+                # fall back to NaN so seaborn shows a blank cell instead of
+                # KeyError on a missing key.
+                cell = res[probe_key]["per_language"].get(lang, {})
+                matrix[mi, li] = cell.get("accuracy", float("nan"))
         sns.heatmap(
             matrix, annot=True, fmt=".2f", cmap="YlGn",
             xticklabels=LANGUAGES, yticklabels=labels,
@@ -298,6 +352,18 @@ def main():
             )
             gc.collect()
 
+    # V8 (review-2026-05-21): same partial-success guard as Strategy D.
+    import os as _os
+    if failed_models and _os.environ.get("Z_GAP_ALLOW_PARTIAL_RESULTS") != "1":
+        print(
+            f"\n[FATAL] {len(failed_models)}/{len(MODELS)} model(s) failed; "
+            f"refusing to write partial Strategy E results.\n"
+            f"        Failed: {[f['label'] for f in failed_models]}\n"
+            f"        Set Z_GAP_ALLOW_PARTIAL_RESULTS=1 to override.",
+            file=sys.stderr,
+        )
+        sys.exit(2)
+
     # Summary
     print(f"\n{'='*60}")
     print("CROSS-MODEL P3 SUMMARY")
@@ -311,9 +377,8 @@ def main():
         op_xfer = res["operation_probe"]["mean_transfer"]
         print(f"{res['label']:<25s}  {cat_en:>7.3f}  {cat_xfer:>12.3f}  {op_en:>6.3f}  {op_xfer:>12.3f}")
 
-    make_heatmaps(all_results)
-
-    # Save
+    # V7 (review-2026-05-21): save BEFORE figures so a matplotlib failure
+    # does not lose the probing results.
     RESULTS_DIR.mkdir(parents=True, exist_ok=True)
     run_meta["finished_at_utc"] = datetime.datetime.now(datetime.UTC).isoformat()
     run_meta["n_models_attempted"] = len(MODELS)
@@ -333,10 +398,17 @@ def _convert(obj):
     with open(out_path, "w") as f:
         json.dump(payload, f, indent=2, default=_convert)
     print(f"\n  Results saved: {out_path}")
+    # With the partial-results guard in place, failed_models is only non-empty
+    # here when Z_GAP_ALLOW_PARTIAL_RESULTS=1; keep naming the missing models.
     if failed_models:
         print(f"  [WARN] {len(failed_models)} model(s) skipped:")
         for err in failed_models:
             print(f"    - {err['label']}: {err['error_type']}")
+
+    try:
+        make_heatmaps(all_results)
+    except Exception as e:  # noqa: BLE001
+        print(f"  [WARN] make_heatmaps failed: {type(e).__name__}: {e}", file=sys.stderr)
 
 
 if __name__ == "__main__":

diff --git a/experiments/scripts/run_strategy_f_ood_alignment.py b/experiments/scripts/run_strategy_f_ood_alignment.py
@@ -75,6 +75,14 @@ def load_ood_stimuli() -> tuple[list[dict], dict[str, str]]:
     with open(DATA_DIR / "tier3_compositional.json") as f:
         tier3 = json.load(f)
     ops = tier2 + tier3
+    # V12 (review-2026-05-21): assert op_id uniqueness across the two tiers
+    # so a future id collision does not silently double-count pairings in
+    # compute_per_language_R_code.
+    op_ids = [op["id"] for op in ops]
+    if len(set(op_ids)) != len(op_ids):
+        from collections import Counter
+        dups = [k for k, v in Counter(op_ids).items() if v > 1]
+        raise ValueError(f"tier2/tier3 op_id collision: {dups}")
     code_equivalents = {op["id"]: op["code"] for op in ops}
     return ops, code_equivalents
 
@@ -253,6 +261,19 @@ def main():
             )
             gc.collect()
 
+    # V8 (review-2026-05-21): refuse partial results so paper's "35/35 OOD
+    # cells" claim is never silently invalidated by a model dropout.
+    import os as _os
+    if failed and _os.environ.get("Z_GAP_ALLOW_PARTIAL_RESULTS") != "1":
+        print(
+            f"\n[FATAL] {len(failed)}/{len(MODELS)} model(s) failed; "
+            f"refusing to write partial Strategy F results.\n"
+            f"        Failed: {[f['label'] for f in failed]}\n"
+            f"        Set Z_GAP_ALLOW_PARTIAL_RESULTS=1 to override.",
+            file=sys.stderr,
+        )
+        sys.exit(2)
+
     # Holm-Bonferroni across all (model, language) cells
     all_p, p_index = [], []
     for mi, res in enumerate(all_results):
@@ -298,9 +319,7 @@ def main():
     print(f"\n  OOD R_code > 1 and significant: {n_sig}/{n_total} cells")
     print(f"  (Strategy D tier1 baseline: 35/35 cells)")
 
-    make_figure(all_results)
-
-    # Save
+    # V7 (review-2026-05-21): save BEFORE figures.
     RESULTS_DIR.mkdir(parents=True, exist_ok=True)
     run_meta["finished_at_utc"] = datetime.datetime.now(datetime.UTC).isoformat()
     run_meta["n_models_attempted"] = len(MODELS)
@@ -323,10 +342,18 @@ def _convert(obj):
     with open(out_path, "w") as f:
         json.dump(payload, f, indent=2, default=_convert)
     print(f"\n  Results saved: {out_path}")
+    # With the partial-results guard in place, `failed` is only non-empty here
+    # when Z_GAP_ALLOW_PARTIAL_RESULTS=1; keep naming the missing models.
     if failed:
         print(f"  [WARN] {len(failed)} model(s) skipped:")
         for err in failed:
             print(f"    - {err['label']}: {err['error_type']}")
+
+    # Figures last (best-effort).
+    try:
+        make_figure(all_results)
+    except Exception as e:  # noqa: BLE001
+        print(f"  [WARN] make_figure failed: {type(e).__name__}: {e}", file=sys.stderr)
 
 
 if __name__ == "__main__":