Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
349 changes: 349 additions & 0 deletions experiments/scripts/run_strategy_e_multimodel_probing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
#!/usr/bin/env python3
"""Strategy E: Multi-Model P3 Cross-Lingual Probing.

Closes M5 from the 2026-05-21 pre-experiment review. Extends the paper's
P3 result (originally MiniLM-L12 only; see paper/main.tex §5.5 "P3 Results")
to the 7-model set used in Strategy D so the Z_sem stratification claim
no longer rests on a single model.

For each of the 7 models:
- Embed all 100 operations × 5 languages (uses EmbeddingCache for hits)
- Train LogisticRegression on English embeddings:
Probe 1: category (computational vs judgment, chance 50%)
Probe 2: operation identity (100-way, chance 1%)
- Test cross-lingual transfer accuracy on each non-English language
- Compute binomial p-values against chance for each cell

Outputs:
- results/strategy_e_multimodel_probing.json (full per-cell data + meta)
- results/figures/strategy_e_category_heatmap.png
- results/figures/strategy_e_operation_heatmap.png

Run-meta (review-2026-05-21 pattern): timestamp, python/torch/st versions,
seed, per-model success/failure with try/except wrap.
"""

from __future__ import annotations

import datetime
import gc
import json
import platform
import sys
from pathlib import Path

import numpy as np

sys.path.insert(0, str(Path(__file__).parent.parent))

from src.stimuli import get_all_operations, LANGUAGES
from src.embeddings import SentenceTransformerEmbedder, EmbeddingCache

# Filesystem layout: all outputs live under results/, which sits in the
# directory two levels above this script.
ROOT = Path(__file__).parent.parent
RESULTS_DIR = ROOT.joinpath("results")
FIGURES_DIR = RESULTS_DIR.joinpath("figures")
CACHE_DIR = RESULTS_DIR.joinpath("embeddings")

# Models to probe, one (model id, display label, extra embedder kwargs) tuple
# per entry. This is the same 7-model set as Strategy D
# (run_strategy_d_code_alignment.py); the two lists are synchronised by hand,
# so a shared model_registry.py would be worthwhile if the set is extended.
MODELS = [
    ("microsoft/unixcoder-base", "UniXcoder (code)", {}),
    ("paraphrase-multilingual-MiniLM-L12-v2", "MiniLM-L12 (NL)", {}),
    (
        "nomic-ai/nomic-embed-text-v1.5",
        "Nomic v1.5 (NL+code)",
        {"trust_remote_code": True},
    ),
    ("intfloat/multilingual-e5-small", "E5-small (NL)", {}),
    ("intfloat/multilingual-e5-base", "E5-base (NL)", {}),
    ("intfloat/multilingual-e5-large", "E5-large (NL)", {}),
    ("BAAI/bge-m3", "BGE-M3 (NL+code)", {}),
]

# Seed shared with Strategy D so runs are comparable across experiments.
SEED = 42


def _binomial_p_vs_chance(n_correct: int, n_total: int, p_chance: float) -> float:
"""One-sided binomial test: P(X >= n_correct | n_total, p_chance)."""
from scipy import stats as scipy_stats
return float(scipy_stats.binomtest(n_correct, n_total, p=p_chance, alternative="greater").pvalue)


def run_model_probing(model_name: str, label: str, kwargs: dict) -> dict:
    """Run the P3 category and operation probes for one embedding model.

    Every available (operation, language) description is embedded, then two
    logistic-regression probes are fitted on the English embeddings only and
    scored on every language in ``LANGUAGES`` (English included, as the
    training-set reference):

    * Probe 1: binary category (``"computational"`` vs. anything else),
      tested against a chance rate of 0.5.
    * Probe 2: operation identity, tested against a chance rate of
      ``1 / number of operations``.

    Args:
        model_name: Model identifier handed to ``SentenceTransformerEmbedder``.
        label: Human-readable model name used in console output and results.
        kwargs: Extra keyword arguments forwarded to the embedder constructor.

    Returns:
        A dict with ``model``, ``label``, ``dim`` and, for each of
        ``category_probe`` / ``operation_probe``, the ``per_language`` cells
        (accuracy, n_correct, n_total, p_value_vs_chance) plus the
        ``mean_transfer`` accuracy averaged over the non-English languages.

    Raises:
        ValueError: If no English embeddings exist to train on, or a language
            in ``LANGUAGES`` has no embeddings to test on.
    """
    # Imported first so a missing sklearn fails before any model is loaded.
    from sklearn.linear_model import LogisticRegression

    print(f"\n{'='*60}")
    print(f" {label} ({model_name})")
    print(f"{'='*60}")

    ops = get_all_operations()
    all_ids = [op.id for op in ops]
    category_labels = {op.id: 1 if op.category == "computational" else 0 for op in ops}
    operation_labels = {op_id: i for i, op_id in enumerate(all_ids)}

    cache = EmbeddingCache(CACHE_DIR)
    model = SentenceTransformerEmbedder(model_name, **kwargs)
    print(f" dim={model.dimension}")

    # Collect every available description; `keys` stays index-aligned with
    # `texts` so each returned embedding row can be looked up by "<op>_<lang>".
    texts, keys = [], []
    for op in ops:
        for lang in LANGUAGES:
            desc = op.descriptions.get(lang)
            if desc:
                texts.append(desc)
                keys.append(f"{op.id}_{lang}")

    embeddings_array = cache.get_or_compute(model, texts)
    embeddings = {k: embeddings_array[i] for i, k in enumerate(keys)}
    print(f" {len(embeddings)} NL embeddings ready ({len(ops)} ops × {len(LANGUAGES)} langs)")

    # --- Probe 1: category (chance 50%) ---
    cat_results, cat_transfer = _score_probe(
        LogisticRegression(max_iter=2000, random_state=SEED, C=1.0),
        embeddings, all_ids, category_labels, 0.5,
    )

    # --- Probe 2: operation identity (chance 1 / number of operations) ---
    op_results, op_transfer = _score_probe(
        LogisticRegression(max_iter=3000, random_state=SEED, C=1.0),
        embeddings, all_ids, operation_labels, 1.0 / len(all_ids),
    )

    _print_probe_table("\n Probe 1 (category, chance 50%):", cat_results, cat_transfer)
    _print_probe_table("\n Probe 2 (operation 100-way, chance 1%):", op_results, op_transfer)

    result = {
        "model": model_name,
        "label": label,
        "dim": int(model.dimension),
        "category_probe": {
            "per_language": cat_results,
            "mean_transfer": cat_transfer,
        },
        "operation_probe": {
            "per_language": op_results,
            "mean_transfer": op_transfer,
        },
    }

    # Drop the model and embeddings before the caller loads the next model.
    del model, embeddings_array, embeddings
    gc.collect()
    return result


def _collect_split(embeddings: dict, op_ids: list, labels: dict, lang: str) -> tuple:
    """Build the (X, y) arrays for one language, skipping missing embeddings."""
    rows, targets = [], []
    for op_id in op_ids:
        key = f"{op_id}_{lang}"
        if key in embeddings:
            rows.append(embeddings[key])
            targets.append(labels[op_id])
    return np.array(rows), np.array(targets)


def _score_probe(clf, embeddings: dict, op_ids: list, labels: dict, p_chance: float) -> tuple:
    """Fit ``clf`` on English embeddings and score it on every language.

    Returns ``(per_language, mean_transfer)`` where ``mean_transfer`` averages
    the accuracy over all languages except English.
    """
    from sklearn.metrics import accuracy_score

    X_train, y_train = _collect_split(embeddings, op_ids, labels, "en")
    if len(y_train) == 0:
        # Fail with a readable message instead of an sklearn shape error.
        raise ValueError("no English ('en') embeddings available to train the probe")
    clf.fit(X_train, y_train)

    per_language = {}
    for lang in LANGUAGES:
        X_test, y_test = _collect_split(embeddings, op_ids, labels, lang)
        if len(y_test) == 0:
            raise ValueError(f"no embeddings available to test language {lang!r}")
        preds = clf.predict(X_test)
        n_correct = int(np.sum(preds == y_test))
        n_total = int(len(y_test))
        per_language[lang] = {
            "accuracy": float(accuracy_score(y_test, preds)),
            "n_correct": n_correct,
            "n_total": n_total,
            "p_value_vs_chance": _binomial_p_vs_chance(n_correct, n_total, p_chance),
        }

    mean_transfer = float(
        np.mean([r["accuracy"] for lang, r in per_language.items() if lang != "en"])
    )
    return per_language, mean_transfer


def _print_probe_table(header: str, per_language: dict, mean_transfer: float) -> None:
    """Print the per-language accuracy table for one probe."""
    print(header)
    print(f" {'Lang':<6s} {'acc':>6s} {'n_correct/n_total':>18s} {'p_vs_chance':>12s}")
    print(f" {'─'*48}")
    for lang in LANGUAGES:
        r = per_language[lang]
        marker = "(train)" if lang == "en" else ""
        print(f" {lang:<6s} {r['accuracy']:>6.3f} {r['n_correct']:>9d}/{r['n_total']:<8d} {r['p_value_vs_chance']:>12.4g} {marker}")
    print(f" mean transfer (non-en): {mean_transfer:.3f}")


def _build_run_meta() -> dict:
    """Collect reproducibility metadata for the start of a run.

    Returns:
        A dict holding the UTC start timestamp (ISO 8601), Python and platform
        identifiers, the versions of sentence_transformers / torch / sklearn /
        numpy (``"unknown"`` when a package cannot be imported or exposes no
        ``__version__``), the random seed, and the review identifiers.
    """
    return {
        # timezone.utc instead of datetime.UTC: the latter only exists on
        # Python 3.11+, and both produce the same "+00:00" ISO timestamp.
        "started_at_utc": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "python": platform.python_version(),
        "platform": platform.platform(),
        "sentence_transformers": _module_version("sentence_transformers"),
        "torch": _module_version("torch"),
        "sklearn": _module_version("sklearn"),
        "numpy": np.__version__,
        "seed": SEED,
        "review_id": "review-2026-05-21",
        "closes": "M5 (multi-model P3 probing)",
    }


def _module_version(module_name: str) -> str:
    """Return ``<module>.__version__``, or ``"unknown"`` if it is unavailable."""
    import importlib

    try:
        return importlib.import_module(module_name).__version__
    except Exception:  # noqa: BLE001 - metadata lookup must never abort a run
        return "unknown"


def make_heatmaps(all_results: list[dict]) -> None:
    """Render one model × language accuracy heatmap per probe.

    Writes ``strategy_e_category_heatmap.png`` and
    ``strategy_e_operation_heatmap.png`` into ``FIGURES_DIR``, creating the
    directory if needed.

    Args:
        all_results: Per-model result dicts as returned by
            ``run_model_probing``. When empty (e.g. every model failed),
            nothing is drawn and no plotting library is imported.
    """
    if not all_results:
        # Nothing to plot; returning here also avoids importing matplotlib.
        print(" [WARN] No successful model results; skipping heatmaps.")
        return

    import matplotlib
    matplotlib.use("Agg")  # file-only backend: figures are saved, never shown
    import matplotlib.pyplot as plt
    import seaborn as sns

    FIGURES_DIR.mkdir(parents=True, exist_ok=True)
    n_models = len(all_results)
    n_langs = len(LANGUAGES)

    # (probe name, result key, colour-scale min, colour-scale max, chance rate)
    for probe_name, probe_key, vmin, vmax, chance_line in [
        ("category", "category_probe", 0.5, 1.0, 0.5),
        ("operation", "operation_probe", 0.0, 1.0, 0.01),
    ]:
        fig, ax = plt.subplots(figsize=(10, 5))
        matrix = np.zeros((n_models, n_langs))
        labels = []
        for mi, res in enumerate(all_results):
            labels.append(res["label"])
            for li, lang in enumerate(LANGUAGES):
                matrix[mi, li] = res[probe_key]["per_language"][lang]["accuracy"]
        sns.heatmap(
            matrix, annot=True, fmt=".2f", cmap="YlGn",
            xticklabels=LANGUAGES, yticklabels=labels,
            vmin=vmin, vmax=vmax, linewidths=0.5, ax=ax,
        )
        ax.set_title(
            f"Strategy E: {probe_name.capitalize()} probe — cross-lingual transfer accuracy\n"
            f"Train on English; chance = {chance_line:.2f}"
        )
        fig.tight_layout()
        path = FIGURES_DIR / f"strategy_e_{probe_name}_heatmap.png"
        fig.savefig(path, dpi=150, bbox_inches="tight")
        plt.close(fig)
        print(f" Figure saved: {path.name}")


def main():
    """Probe every model in ``MODELS``, then save results and figures.

    Each model runs inside its own try/except so one failure does not abort
    the experiment; failures are recorded in the run metadata. The JSON
    results are written before the heatmaps are rendered, so a plotting
    problem cannot discard the results of a completed run.
    """
    print("=" * 60)
    print("Strategy E: Multi-Model P3 Cross-Lingual Probing")
    print(f"({len(MODELS)} models × {len(LANGUAGES)} languages)")
    print("=" * 60)

    run_meta = _build_run_meta()
    print(f"\n started_at_utc={run_meta['started_at_utc']}")
    print(f" python={run_meta['python']} st={run_meta['sentence_transformers']} sklearn={run_meta['sklearn']}")
    print(f" seed={run_meta['seed']}")

    all_results = []
    failed_models = []
    for model_name, label, kwargs in MODELS:
        try:
            res = run_model_probing(model_name, label, kwargs)
            all_results.append(res)
        except Exception as exc:  # noqa: BLE001 - deliberate per-model isolation
            err = {
                "model": model_name,
                "label": label,
                "error_type": type(exc).__name__,
                "error_message": str(exc),
            }
            failed_models.append(err)
            print(
                f"\n [SKIP] {label} ({model_name}) failed: "
                f"{err['error_type']}: {err['error_message']}",
                file=sys.stderr,
            )
            gc.collect()

    # Summary
    print(f"\n{'='*60}")
    print("CROSS-MODEL P3 SUMMARY")
    print(f"{'='*60}")
    print(f"\n{'Model':<25s} {'cat_en':>7s} {'cat_transfer':>12s} {'op_en':>6s} {'op_transfer':>12s}")
    print(f"{'─'*70}")
    for res in all_results:
        cat_en = res["category_probe"]["per_language"]["en"]["accuracy"]
        cat_xfer = res["category_probe"]["mean_transfer"]
        op_en = res["operation_probe"]["per_language"]["en"]["accuracy"]
        op_xfer = res["operation_probe"]["mean_transfer"]
        print(f"{res['label']:<25s} {cat_en:>7.3f} {cat_xfer:>12.3f} {op_en:>6.3f} {op_xfer:>12.3f}")

    # Save first: the probing results are the expensive artefact, so they are
    # persisted before any figure rendering gets a chance to fail.
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    # timezone.utc rather than datetime.UTC, which only exists on Python 3.11+.
    run_meta["finished_at_utc"] = datetime.datetime.now(datetime.timezone.utc).isoformat()
    run_meta["n_models_attempted"] = len(MODELS)
    run_meta["n_models_succeeded"] = len(all_results)
    run_meta["failed_models"] = failed_models

    out_path = RESULTS_DIR / "strategy_e_multimodel_probing.json"
    payload = {"_meta": run_meta, "results": all_results}

    def _convert(obj):
        """JSON ``default`` hook: turn numpy scalars/arrays into builtins."""
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        # Returning the object unchanged makes json report a misleading
        # "Circular reference detected"; the hook contract is to raise.
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, default=_convert)
    print(f"\n Results saved: {out_path}")

    if all_results:
        make_heatmaps(all_results)
    else:
        print(" [WARN] no models succeeded; skipping heatmaps")

    if failed_models:
        print(f" [WARN] {len(failed_models)} model(s) skipped:")
        for err in failed_models:
            print(f" - {err['label']}: {err['error_type']}")


# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()
23 changes: 15 additions & 8 deletions paper/main.tex
Original file line number Diff line number Diff line change
Expand Up @@ -494,23 +494,30 @@ \subsection{Pilot Experiment and Results}\label{sec:pilot}
\paragraph{P7 Extension: Punctuation Robustness.}
We extend P7 to punctuation and formatting variants. For each of 100 English operations, we generate 10 variants: bare, period, question mark, exclamation, ellipsis, colon, lowercase, UPPERCASE, extra spaces, and article removal. $R_{\text{punct}} = d_{\text{semantic}} / d_{\text{punct}} = 13.6$---punctuation variants are ${\sim}14\times$ closer than semantically different operations, far exceeding spacing robustness ($R_{\text{spacing}} \approx 2.9$). Most variants drift minimally (period: 0.014, question mark: 0.013). The outlier is UPPERCASE (drift = 0.192), which acts as a pragmatic signal (emphasis, shouting)---evidence that $\Zprag$ is encoded in surface-form cues even when $\Zsem$ is unchanged.

\paragraph{P3 Results: Stratification Separability.}
We train linear probes on English embeddings (MiniLM-L12) and test cross-lingual transfer, directly testing whether $\Zsem$ is separable across languages.
\paragraph{P3 Results: Stratification Separability (7-model).}
We train linear probes on English embeddings and test cross-lingual transfer across the same 7-model set used in the NL-code alignment experiment, directly testing whether $\Zsem$ is separable across languages and whether the separability is model-dependent.

\begin{table}[h]
\centering
\small
\begin{tabular}{lccccc}
\begin{tabular}{lcc|cc}
\toprule
\textbf{Probe task} & \textbf{en} & \textbf{es} & \textbf{zh} & \textbf{ar} & \textbf{ko} \\
& \multicolumn{2}{c|}{\textbf{Category} (chance $0.50$)} & \multicolumn{2}{c}{\textbf{Operation} (chance $0.01$)} \\
\textbf{Model} & en (train) & non-en mean & en (train) & non-en mean \\
\midrule
Category (comp/judg, chance 50\%) & 1.00 & 0.97 & 0.96 & 0.87 & 0.80 \\
Operation ID (100-way, chance 1\%) & 1.00 & 0.98 & 0.93 & 0.86 & 0.66 \\
UniXcoder (code) & 0.99 & 0.67 & 1.00 & 0.18 \\
MiniLM-L12 (NL) & 1.00 & 0.90 & 1.00 & 0.86 \\
Nomic v1.5 (NL+code) & 1.00 & 0.62 & 1.00 & 0.23 \\
E5-small (NL) & 1.00 & 0.98 & 1.00 & 0.89 \\
E5-base (NL) & 1.00 & 0.98 & 1.00 & 0.96 \\
E5-large (NL) & 1.00 & 0.99 & 1.00 & 0.98 \\
BGE-M3 (NL+code) & 1.00 & 0.99 & 1.00 & 0.98 \\
\bottomrule
\end{tabular}
\caption*{\small Every non-en cell with accuracy $> 0.5$ satisfies a one-sided binomial test against chance with $p < 10^{-25}$. Mean transfer is averaged over the 4 non-English languages (ko, zh, ar, es). Strategy E (\texttt{experiments/scripts/run\_strategy\_e\_multimodel\_probing.py}).}
\end{table}

P3 is \textbf{supported}: a classifier trained only on English embeddings achieves 90\% mean accuracy on category transfer and 85.8\% on 100-way operation identification across non-English languages---far above chance. This is direct evidence that $\Zsem$ structure generalizes cross-lingually. Korean transfers worst (80\%/66\%), consistent with $\Dtrain$ effects and typological distance from English.
P3 is \textbf{supported in multilingual NL models but is model-class dependent}. The multilingual NL family (MiniLM, E5 small/base/large, BGE-M3) achieves $0.90$--$0.99$ category transfer and $0.86$--$0.98$ operation transfer---direct evidence that $\Zsem$ structure generalizes cross-lingually for models trained on multilingual NL. Code-trained (UniXcoder: $0.67$ / $0.18$) and mixed NL+code (Nomic v1.5: $0.62$ / $0.23$) models show much weaker cross-lingual operation transfer despite near-perfect English training accuracy, indicating their NL representations do not align cross-lingually even when within-English performance is adequate. The E5 family alone exhibits a clean P3 scale-convergence echo of the NL-code alignment pattern: operation transfer rises $0.89$ (small, 384d) $\to 0.96$ (base, 768d) $\to 0.98$ (large, 1024d), under fixed architecture and training recipe. This refines the original P3 claim: cross-lingual $\Zsem$ separability is a property of the multilingual NL training distribution, not an intrinsic property of all embedding spaces with $R_{\text{code}} > 1$.

\paragraph{Dialect and paraphrase distance hierarchy.}
To test whether the communicability gap is graded, we measure three distance levels using English dialect pairs (American vs.\ British, American vs.\ Indian English) and within-language paraphrases. Across three models, the observed ordering is $d_{\text{dialect}} < d_{\text{paraphrase}} < d_{\text{cross-lingual}}$ (e.g., $0.013 < 0.069 < 0.090$ for E5-large). Dialect variation produces \emph{minimal} embedding distance (British: $d \approx 0.001$; Indian: $d \approx 0.025$), because embedding models trained on standard text collapse dialectal surface variation. Paraphrases---semantically identical but lexically different---produce measurable distances ($d \approx 0.07$--$0.26$), and cross-lingual distances are largest. This hierarchy shows that the communicability gap is continuous and multi-layered: surface-form variation (dialect $<$ paraphrase) is distinct from language-level variation, and embedding models are insensitive to dialectal differences relative to lexical choice.
Expand Down Expand Up @@ -631,7 +638,7 @@ \section*{Limitations}

\textbf{Pilot measures description-level, not execution-level, convergence.} The P2 failure highlights this gap: NL embedding similarity is a proxy, not a direct test, of $\Zsem$ convergence. Our follow-up analyses (vocabulary mediation and language-pair decomposition) explain the P2 failure as a description-level vocabulary phenomenon, and the NL-code alignment experiment confirms execution-level convergence across four models and five languages (20/20 cells significant). However, all models are sentence-level embedders; decoder-only LLM representations may behave differently.

\textbf{The $Z$ stratification is a conceptual framework.} While the pilot provides supporting evidence (P7 supported, P2 failure explained by vocabulary mediation, P3 supporting cross-lingual separability), the stratification has not been validated with large-scale probing across multiple model families.
\textbf{The $Z$ stratification is a conceptual framework.} While the pilot provides supporting evidence (P7 supported, P2 failure explained by vocabulary mediation, P3 supported on 7 models with model-class dependence---multilingual NL strong, code-trained / NL+code mixed weak), large-scale probing across decoder-only LLM families (e.g., Llama 3.1 hidden states) and operation-level OOD stimuli (\texttt{tier2\_multistep.json}, \texttt{tier3\_compositional.json} in the experiment repository) remains future work.

\textbf{Vocabulary mediation analysis has limited power.} With $n=50$ operations per category and Bonferroni correction across 8 features, the minimum detectable effect is $|\rho| \geq 0.35$ (pooled) or $|\rho| \geq 0.48$ (within-category). Moderate effects may be missed. The language-pair decomposition uses ordinal typological ranks assigned by the authors, not an independent typological distance metric.

Expand Down
Loading
Loading