From b36f6c74b883ffe23425e0693eb79f19de46f3c1 Mon Sep 17 00:00:00 2001 From: wcwxy <26245345+ChaoWao@users.noreply.github.com> Date: Sat, 11 Apr 2026 12:53:35 +0800 Subject: [PATCH] Fix: ci.py crash on macOS from duplicate libomp load On macOS, `python ci.py -p a2a3sim` (or a5sim) aborts every task with "OMP: Error #15: Initializing libomp.dylib, but found libomp.dylib already initialized" (SIGABRT) before any DeviceRunner code runs. Two distinct libomp.dylib copies get mapped into the single CI process: homebrew's /opt/homebrew/opt/libomp/lib/libomp.dylib (via numpy -> openblas) and pip torch's .venv/.../torch/lib/libomp.dylib. They have different install names, so dyld loads them both and Intel's libomp aborts on the second init. Surfaced after #493 collapsed sim CI into one long-lived Python process; each golden's `import numpy`/`import torch` now accumulates conflicting libomps in the same address space. - Set KMP_DUPLICATE_LIB_OK=TRUE at the top of ci.py on darwin, before any import that can transitively pull in numpy or torch. This is Intel's documented escape hatch; safe for our workload where numpy and torch are only used for golden reference math, not parallel OMP regions. - Document the full root cause, debugging steps, and explicit "what not to do" list in docs/macos-libomp-collision.md so future contributors don't re-investigate. Link it from docs/ci.md. - Rewrite the two remaining numpy-based goldens (a2a3/{aicpu,host}_build_graph/bgemm) in torch for style consistency with the rest of examples/. Note this does not avoid the libomp collision on its own -- `import torch` transitively imports numpy. Verified: `python ci.py` passes 32/32 sim tests (20 a2a3sim + 12 a5sim) on macOS without KMP_DUPLICATE_LIB_OK needing to be set manually. 
--- ci.py | 26 ++++- docs/ci.md | 4 + docs/macos-libomp-collision.md | 104 ++++++++++++++++++ .../a2a3/aicpu_build_graph/bgemm/golden.py | 16 +-- .../a2a3/host_build_graph/bgemm/golden.py | 16 +-- 5 files changed, 148 insertions(+), 18 deletions(-) create mode 100644 docs/macos-libomp-collision.md diff --git a/ci.py b/ci.py index 96d675c36..83f3fb04c 100644 --- a/ci.py +++ b/ci.py @@ -21,14 +21,36 @@ from __future__ import annotations +import os +import sys + +# --------------------------------------------------------------------------- +# macOS libomp collision workaround — MUST run before any import that may +# transitively load numpy or torch. See docs/macos-libomp-collision.md for +# the full analysis. +# +# On macOS with a --system-site-packages venv, homebrew's numpy pulls in +# /opt/homebrew/opt/libomp/lib/libomp.dylib (via openblas), while pip's +# torch ships its own .venv/.../torch/lib/libomp.dylib under a different +# install name (/opt/llvm-openmp/lib/libomp.dylib). Because the two +# dylibs have distinct install names, dyld loads them both, and Intel's +# libomp aborts the process with "OMP: Error #15 ... libomp already +# initialized" (SIGABRT). +# +# The officially-documented escape hatch is KMP_DUPLICATE_LIB_OK=TRUE. +# For our CI workload (numpy random + torch golden compute, no heavy +# parallel OMP regions) the two runtimes never actually race, so allowing +# the duplicate load is safe in practice. 
+# --------------------------------------------------------------------------- +if sys.platform == "darwin": + os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") + import argparse import importlib.util import json import logging -import os import signal import subprocess -import sys import tempfile import time from concurrent.futures import ThreadPoolExecutor diff --git a/docs/ci.md b/docs/ci.md index 7fcee648b..28e1b2cdb 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -227,6 +227,10 @@ python ci.py -p a2a3 -d 4-7 -c 6622890 -t 600 python ci.py -p a2a3sim -r tensormap_and_ringbuffer ``` +### Platform notes + +- **macOS libomp collision**: on macOS, `ci.py` sets `KMP_DUPLICATE_LIB_OK=TRUE` at the top of the file to work around a duplicate-libomp abort triggered by homebrew numpy and pip torch coexisting in one process. Do not reorder the imports or remove this workaround without reading [macos-libomp-collision.md](macos-libomp-collision.md) first. + ### Task discovery `ci.py` scans two directories: diff --git a/docs/macos-libomp-collision.md b/docs/macos-libomp-collision.md new file mode 100644 index 000000000..25f478647 --- /dev/null +++ b/docs/macos-libomp-collision.md @@ -0,0 +1,104 @@ +# macOS libomp Collision in Single-Process CI + +## TL;DR + +On macOS, `ci.py` would crash with `OMP: Error #15 ... libomp.dylib already initialized` (SIGABRT, every task fails before any runtime code runs) because two different `libomp.dylib` copies get loaded into the same Python process — one via `numpy → openblas`, one via `torch`. We work around this at the top of `ci.py` by setting `KMP_DUPLICATE_LIB_OK=TRUE` before any import that can pull in numpy or torch. This doc exists so the next person who touches sim CI does not re-investigate the same rabbit hole. + +## Symptom + +Running `python ci.py -p a2a3sim` (or `a5sim`) on macOS produces, for **every** task: + +```text +OMP: Error #15: Initializing libomp.dylib, but found libomp.dylib already initialized. 
+OMP: Hint This means that multiple copies of the OpenMP runtime have been linked into the program. +--- FAIL: example:a2a3/... (dev0, attempt 1) --- +``` + +Exit code of the spawned worker is `134` (SIGABRT). The failure happens during golden `import`, so no DeviceRunner, no `pto_runtime_c_api.cpp`, no aicpu/aicore thread ever executes. + +## Root Cause + +Two distinct `libomp.dylib` copies get mapped into the single Python process used by `ci.py`: + +1. **Homebrew's libomp** — `/opt/homebrew/opt/libomp/lib/libomp.dylib`, pulled in by the chain: + `numpy → openblas (/opt/homebrew/opt/openblas/lib/libopenblas.0.dylib) → libomp` + + `numpy` is loaded from the homebrew-managed system Python because our venv is created with `--system-site-packages` (required by `.claude/rules/venv-isolation.md`). Homebrew's numpy links against homebrew's openblas, which links against homebrew's libomp. + +2. **PyTorch's bundled libomp** — `.venv/lib/python3.14/site-packages/torch/lib/libomp.dylib`, pulled in by: + `torch → torch/_C → libtorch_python → libomp` + + pip's torch wheel ships its own libomp with install name `/opt/llvm-openmp/lib/libomp.dylib`. + +The two dylibs have **different `LC_ID_DYLIB` install names** (verified with `otool -D`), so `dyld` loads them as completely separate images even though they expose the identical `__kmpc_*` / `GOMP_*` symbol set. When the second libomp initializes, Intel's OMP runtime detects a prior active libomp and calls `abort()`. + +`DYLD_INSERT_LIBRARIES` and `ctypes.CDLL(..., RTLD_GLOBAL)` **do not fix this** — dyld resolves the dependency chain by install name, not by symbol matching against already-loaded libraries. + +Reproducer (no CI code required): + +```console +$ source .venv/bin/activate +$ python -c "import numpy; import torch" +OMP: Error #15: Initializing libomp.dylib, but found libomp.dylib already initialized. 
+[1] 12345 abort python -c "import numpy; import torch" +``` + +Any golden importing `torch` after another golden has already imported `numpy` (or vice versa) is enough — and `import torch` transitively imports numpy, so even "all goldens use torch" does not avoid it. + +## Why It Surfaced Now + +Before commit `a90b0a2` ("run sim CI in single subprocess with parallel workers"), `run_sim_tasks_subprocess` launched one fresh Python subprocess **per runtime group**. Each subprocess had a clean interpreter and only loaded its own goldens, so numpy and torch rarely coexisted in the same process, and the conflict almost never manifested. + +After `a90b0a2`, all tasks run in one persistent process via `_run_device_worker_subprocess` plus parallel worker threads. Per-golden `import numpy` / `import torch` calls accumulate and the second libomp eventually tries to initialize. + +## Why Linux Does Not Hit This + +On Linux, Homebrew is rarely used; `numpy` and `torch` are usually both pip-installed into the venv, and the wheels share the same `libgomp` / `libomp` from a single location. Even when they do not, ELF's default flat symbol namespace means the second libomp's `__kmpc_*` references are interposed onto the copy that loaded first, so only one runtime ever actually initializes — unlike macOS, where dyld's two-level namespace binds each image to the exact dylib named in its load commands, letting both copies initialize. We have never reproduced OMP Error #15 on Linux for this repo. + +## Mitigation + +At the very top of `ci.py` — before any `import` that might transitively load numpy or torch — we set: + +```python +if sys.platform == "darwin": + os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") +``` + +`KMP_DUPLICATE_LIB_OK=TRUE` is the escape hatch named in Intel's own abort hint. It instructs the libomp runtime to proceed when a duplicate load is detected rather than aborting. Intel labels it "unsafe, unsupported, undocumented"; the concrete risks are: + +- If both libomp copies actually run parallel regions concurrently, thread pool counts can double-count and performance degrades. 
+- Mixing thread-local storage between the two runtimes can misbehave in pathological cases. + +Neither risk applies to our workload: the goldens use numpy/torch only for random-input generation and reference computation (single-threaded in all practical paths), and the real parallel execution happens in our own C++ DeviceRunner threads, not inside libomp. In practice the two libomps sit side-by-side and nothing bad happens. + +## What NOT to Do + +- **Do not** try to fix this by `dlopen`-preloading one libomp with `ctypes.CDLL(..., RTLD_GLOBAL)`. It doesn't work — dyld resolves subsequent libomp references by install name, not by symbol, so the second copy still loads. +- **Do not** try `DYLD_INSERT_LIBRARIES=.../torch/lib/libomp.dylib`. Same reason: different install names. +- **Do not** drop `--system-site-packages` from the venv to try to get a pip-installed numpy — `.claude/rules/venv-isolation.md` requires `--system-site-packages` so system-level driver bindings remain accessible. +- **Do not** "fix" it by removing `numpy` or `torch` imports from goldens. `import torch` transitively imports numpy, and writing golden reference math in pure Python is painful. Converting all goldens to torch does **not** make the conflict go away. +- **Do not** interpret OMP Error #15 as evidence of a sim-parallel threading bug, dlopen/dlclose ordering issue, or pthread TSD race. The crash happens during Python import, well before any C++ DeviceRunner code executes. A significant amount of debugging effort was wasted in commit `5cc0814` ("fix: in progress sim parallel") chasing this misdiagnosis. + +## If You Need to Debug This Again + +1. Check the failure message: if it contains `OMP: Error #15`, it is this issue. If not, look elsewhere. +2. Confirm two libomps are loading with: + + ```console + DYLD_PRINT_LIBRARIES=1 python -c "import numpy, torch" 2>&1 | grep libomp + ``` + + You should see two different `libomp.dylib` paths. +3. 
Verify install names: + + ```console + otool -D /opt/homebrew/opt/libomp/lib/libomp.dylib + otool -D .venv/lib/python3.14/site-packages/torch/lib/libomp.dylib + ``` + +4. Confirm the `ci.py` preamble still sets `KMP_DUPLICATE_LIB_OK` *before* any import that could pull numpy/torch — someone refactoring imports may accidentally put `import numpy` above the `os.environ.setdefault` line. + +## References + +- Intel OpenMP runtime abort hint for Error #15, which itself names `KMP_DUPLICATE_LIB_OK=TRUE` as an "unsafe, unsupported, undocumented" workaround. +- Apple `dyld` man page — two-level namespace symbol binding and dylib install names (`otool -D`, `LC_ID_DYLIB`). diff --git a/examples/a2a3/aicpu_build_graph/bgemm/golden.py b/examples/a2a3/aicpu_build_graph/bgemm/golden.py index 37a4d8d03..233dfa646 100644 --- a/examples/a2a3/aicpu_build_graph/bgemm/golden.py +++ b/examples/a2a3/aicpu_build_graph/bgemm/golden.py @@ -15,7 +15,7 @@ Args layout: [A, B, C] — shape/dtype/size in ContinuousTensor metadata """ -import numpy as np +import torch __outputs__ = ["C"] RTOL = 1e-3 @@ -37,9 +37,9 @@ def generate_inputs(params: dict) -> list: """Generate input tensors with tile-first memory layout.""" - A = np.random.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K).astype(np.float32) * 0.01 - B = np.random.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N).astype(np.float32) * 0.01 - C = np.zeros((BATCH, GRID_M, GRID_N, TILE_M, TILE_N), dtype=np.float32) + A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32) * 0.01 + B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32) * 0.01 + C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32) A_flat = A.flatten() B_flat = B.flatten() @@ -54,9 +54,9 @@ def generate_inputs(params: dict) -> list: def compute_golden(tensors: dict, params: dict) -> None: """Compute golden result: C[m,n] = sum(k) A[m,k] @ B[k,n].""" - A = tensors["A"].reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) - B = tensors["B"].reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) - C = tensors["C"].reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) + A = torch.as_tensor(tensors["A"]).reshape(BATCH, GRID_M, GRID_K, TILE_M, 
TILE_K) + B = torch.as_tensor(tensors["B"]).reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) + C = torch.as_tensor(tensors["C"]).reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) C[:] = 0.0 @@ -64,6 +64,6 @@ def compute_golden(tensors: dict, params: dict) -> None: for m_idx in range(GRID_M): for n_idx in range(GRID_N): for k_idx in range(GRID_K): - C[batch, m_idx, n_idx] += np.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx]) + C[batch, m_idx, n_idx] += torch.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx]) tensors["C"][:] = C.flatten() diff --git a/examples/a2a3/host_build_graph/bgemm/golden.py b/examples/a2a3/host_build_graph/bgemm/golden.py index 771877180..d155c8832 100644 --- a/examples/a2a3/host_build_graph/bgemm/golden.py +++ b/examples/a2a3/host_build_graph/bgemm/golden.py @@ -15,7 +15,7 @@ Args layout: [A, B, C] — shape/dtype/size in ContinuousTensor metadata """ -import numpy as np +import torch __outputs__ = ["C"] RTOL = 1e-3 @@ -37,9 +37,9 @@ def generate_inputs(params: dict) -> list: """Generate input tensors with tile-first memory layout.""" - A = np.random.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K).astype(np.float32) * 0.01 - B = np.random.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N).astype(np.float32) * 0.01 - C = np.zeros((BATCH, GRID_M, GRID_N, TILE_M, TILE_N), dtype=np.float32) + A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32) * 0.01 + B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32) * 0.01 + C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32) return [ ("A", A), @@ -50,9 +50,9 @@ def generate_inputs(params: dict) -> list: def compute_golden(tensors: dict, params: dict) -> None: """Compute golden result: C[m,n] = sum(k) A[m,k] @ B[k,n].""" - A = tensors["A"].reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) - B = tensors["B"].reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) - C = tensors["C"].reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) + A = 
torch.as_tensor(tensors["A"]).reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) + B = torch.as_tensor(tensors["B"]).reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) + C = torch.as_tensor(tensors["C"]).reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) C[:] = 0.0 @@ -60,4 +60,4 @@ def compute_golden(tensors: dict, params: dict) -> None: for m_idx in range(GRID_M): for n_idx in range(GRID_N): for k_idx in range(GRID_K): - C[batch, m_idx, n_idx] += np.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx]) + C[batch, m_idx, n_idx] += torch.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx])