From b36f6c74b883ffe23425e0693eb79f19de46f3c1 Mon Sep 17 00:00:00 2001 From: wcwxy <26245345+ChaoWao@users.noreply.github.com> Date: Sat, 11 Apr 2026 12:53:35 +0800 Subject: [PATCH] Fix: ci.py crash on macOS from duplicate libomp load On macOS, `python ci.py -p a2a3sim` (or a5sim) aborts every task with "OMP: Error #15: Initializing libomp.dylib, but found libomp.dylib already initialized" (SIGABRT) before any DeviceRunner code runs. Two distinct libomp.dylib copies get mapped into the single CI process: homebrew's /opt/homebrew/opt/libomp/lib/libomp.dylib (via numpy -> openblas) and pip torch's .venv/.../torch/lib/libomp.dylib. They have different install names, so dyld loads them both and Intel's libomp aborts on the second init. Surfaced after #493 collapsed sim CI into one long-lived Python process; each golden's `import numpy`/`import torch` now accumulates conflicting libomps in the same address space. - Set KMP_DUPLICATE_LIB_OK=TRUE at the top of ci.py on darwin, before any import that can transitively pull in numpy or torch. This is Intel's documented escape hatch; safe for our workload where numpy and torch are only used for golden reference math, not parallel OMP regions. - Document the full root cause, debugging steps, and explicit "what not to do" list in docs/macos-libomp-collision.md so future contributors don't re-investigate. Link it from docs/ci.md. - Rewrite the two remaining numpy-based goldens (a2a3/{aicpu,host}_build_graph/bgemm) in torch for style consistency with the rest of examples/. Note this does not avoid the libomp collision on its own -- `import torch` transitively imports numpy. Verified: `python ci.py` passes 32/32 sim tests (20 a2a3sim + 12 a5sim) on macOS without KMP_DUPLICATE_LIB_OK needing to be set manually. 
--- ci.py | 26 ++++- docs/ci.md | 4 + docs/macos-libomp-collision.md | 104 ++++++++++++++++++ .../a2a3/aicpu_build_graph/bgemm/golden.py | 16 +-- .../a2a3/host_build_graph/bgemm/golden.py | 16 +-- 5 files changed, 148 insertions(+), 18 deletions(-) create mode 100644 docs/macos-libomp-collision.md diff --git a/ci.py b/ci.py index 96d675c36..83f3fb04c 100644 --- a/ci.py +++ b/ci.py @@ -21,14 +21,36 @@ from __future__ import annotations +import os +import sys + +# --------------------------------------------------------------------------- +# macOS libomp collision workaround — MUST run before any import that may +# transitively load numpy or torch. See docs/macos-libomp-collision.md for +# the full analysis. +# +# On macOS with a --system-site-packages venv, homebrew's numpy pulls in +# /opt/homebrew/opt/libomp/lib/libomp.dylib (via openblas), while pip's +# torch ships its own .venv/.../torch/lib/libomp.dylib under a different +# install name (/opt/llvm-openmp/lib/libomp.dylib). Because the two +# dylibs have distinct install names, dyld loads them both, and Intel's +# libomp aborts the process with "OMP: Error #15 ... libomp already +# initialized" (SIGABRT). +# +# The officially-documented escape hatch is KMP_DUPLICATE_LIB_OK=TRUE. +# For our CI workload (numpy random + torch golden compute, no heavy +# parallel OMP regions) the two runtimes never actually race, so allowing +# the duplicate load is safe in practice. 
+# --------------------------------------------------------------------------- +if sys.platform == "darwin": + os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") + import argparse import importlib.util import json import logging -import os import signal import subprocess -import sys import tempfile import time from concurrent.futures import ThreadPoolExecutor diff --git a/docs/ci.md b/docs/ci.md index 7fcee648b..28e1b2cdb 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -227,6 +227,10 @@ python ci.py -p a2a3 -d 4-7 -c 6622890 -t 600 python ci.py -p a2a3sim -r tensormap_and_ringbuffer ``` +### Platform notes + +- **macOS libomp collision**: on macOS, `ci.py` sets `KMP_DUPLICATE_LIB_OK=TRUE` at the top of the file to work around a duplicate-libomp abort triggered by homebrew numpy and pip torch coexisting in one process. Do not reorder the imports or remove this workaround without reading [macos-libomp-collision.md](macos-libomp-collision.md) first. + ### Task discovery `ci.py` scans two directories: diff --git a/docs/macos-libomp-collision.md b/docs/macos-libomp-collision.md new file mode 100644 index 000000000..25f478647 --- /dev/null +++ b/docs/macos-libomp-collision.md @@ -0,0 +1,104 @@ +# macOS libomp Collision in Single-Process CI + +## TL;DR + +On macOS, `ci.py` would crash with `OMP: Error #15 ... libomp.dylib already initialized` (SIGABRT, every task fails before any runtime code runs) because two different `libomp.dylib` copies get loaded into the same Python process — one via `numpy → openblas`, one via `torch`. We work around this at the top of `ci.py` by setting `KMP_DUPLICATE_LIB_OK=TRUE` before any import that can pull in numpy or torch. This doc exists so the next person who touches sim CI does not re-investigate the same rabbit hole. + +## Symptom + +Running `python ci.py -p a2a3sim` (or `a5sim`) on macOS produces, for **every** task: + +```text +OMP: Error #15: Initializing libomp.dylib, but found libomp.dylib already initialized. 
+OMP: Hint This means that multiple copies of the OpenMP runtime have been linked into the program. +--- FAIL: example:a2a3/... (dev0, attempt 1) --- +``` + +Exit code of the spawned worker is `134` (SIGABRT). The failure happens during golden `import`, so no DeviceRunner, no `pto_runtime_c_api.cpp`, no aicpu/aicore thread ever executes. + +## Root Cause + +Two distinct `libomp.dylib` copies get mapped into the single Python process used by `ci.py`: + +1. **Homebrew's libomp** — `/opt/homebrew/opt/libomp/lib/libomp.dylib`, pulled in by the chain: + `numpy → openblas (/opt/homebrew/opt/openblas/lib/libopenblas.0.dylib) → libomp` + + `numpy` is loaded from the homebrew-managed system Python because our venv is created with `--system-site-packages` (required by `.claude/rules/venv-isolation.md`). Homebrew's numpy links against homebrew's openblas, which links against homebrew's libomp. + +2. **PyTorch's bundled libomp** — `.venv/lib/python3.14/site-packages/torch/lib/libomp.dylib`, pulled in by: + `torch → torch/_C → libtorch_python → libomp` + + pip's torch wheel ships its own libomp with install name `/opt/llvm-openmp/lib/libomp.dylib`. + +The two dylibs have **different `LC_ID_DYLIB` install names** (verified with `otool -D`), so `dyld` loads them as completely separate images even though they expose the identical `__kmpc_*` / `GOMP_*` symbol set. When the second libomp initializes, Intel's OMP runtime detects a prior active libomp and calls `abort()`. + +`DYLD_INSERT_LIBRARIES` and `ctypes.CDLL(..., RTLD_GLOBAL)` **do not fix this** — dyld resolves the dependency chain by install name, not by symbol matching against already-loaded libraries. + +Reproducer (no CI code required): + +```console +$ source .venv/bin/activate +$ python -c "import numpy; import torch" +OMP: Error #15: Initializing libomp.dylib, but found libomp.dylib already initialized. 
+[1] 12345 abort python -c "import numpy; import torch" +``` + +Any golden importing `torch` after another golden has already imported `numpy` (or vice versa) is enough — and `import torch` transitively imports numpy, so even "all goldens use torch" does not avoid it. + +## Why It Surfaced Now + +Before commit `a90b0a2` ("run sim CI in single subprocess with parallel workers"), `run_sim_tasks_subprocess` launched one fresh Python subprocess **per runtime group**. Each subprocess had a clean interpreter and only loaded its own goldens, so numpy and torch rarely coexisted in the same process, and the conflict almost never manifested. + +After `a90b0a2`, all tasks run in one persistent process via `_run_device_worker_subprocess` plus parallel worker threads. Per-golden `import numpy` / `import torch` calls accumulate and the second libomp eventually tries to initialize. + +## Why Linux Does Not Hit This + +On Linux, Homebrew is rarely used; `numpy` and `torch` are usually both pip-installed into the venv, and the wheels share the same `libgomp` / `libomp` from a single location. Even when they do not, ELF's default flat symbol namespace means the second libomp's `__kmpc_*` references are interposed onto the copy that loaded first, so only one runtime ever actually initializes — unlike macOS, where dyld's two-level namespace binds each image to the exact dylib named in its load commands, letting both copies initialize. We have never reproduced OMP Error #15 on Linux for this repo. + +## Mitigation + +At the very top of `ci.py` — before any `import` that might transitively load numpy or torch — we set: + +```python +if sys.platform == "darwin": + os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE") +``` + +`KMP_DUPLICATE_LIB_OK=TRUE` is the escape hatch named in Intel's own abort hint. It instructs the libomp runtime to proceed when a duplicate load is detected rather than aborting. Intel labels it "unsafe, unsupported, undocumented"; the concrete risks are: + +- If both libomp copies actually run parallel regions concurrently, thread pool counts can double-count and performance degrades. 
+- Mixing thread-local storage between the two runtimes can misbehave in pathological cases. + +Neither risk applies to our workload: the goldens use numpy/torch only for random-input generation and reference computation (single-threaded in all practical paths), and the real parallel execution happens in our own C++ DeviceRunner threads, not inside libomp. In practice the two libomps sit side-by-side and nothing bad happens. + +## What NOT to Do + +- **Do not** try to fix this by `dlopen`-preloading one libomp with `ctypes.CDLL(..., RTLD_GLOBAL)`. It doesn't work — dyld resolves subsequent libomp references by install name, not by symbol, so the second copy still loads. +- **Do not** try `DYLD_INSERT_LIBRARIES=.../torch/lib/libomp.dylib`. Same reason: different install names. +- **Do not** drop `--system-site-packages` from the venv to try to get a pip-installed numpy — `.claude/rules/venv-isolation.md` requires `--system-site-packages` so system-level driver bindings remain accessible. +- **Do not** "fix" it by removing `numpy` or `torch` imports from goldens. `import torch` transitively imports numpy, and writing golden reference math in pure Python is painful. Converting all goldens to torch does **not** make the conflict go away. +- **Do not** interpret OMP Error #15 as evidence of a sim-parallel threading bug, dlopen/dlclose ordering issue, or pthread TSD race. The crash happens during Python import, well before any C++ DeviceRunner code executes. A significant amount of debugging effort was wasted in commit `5cc0814` ("fix: in progress sim parallel") chasing this misdiagnosis. + +## If You Need to Debug This Again + +1. Check the failure message: if it contains `OMP: Error #15`, it is this issue. If not, look elsewhere. +2. Confirm two libomps are loading with: + + ```console + DYLD_PRINT_LIBRARIES=1 python -c "import numpy, torch" 2>&1 | grep libomp + ``` + + You should see two different `libomp.dylib` paths. +3. 
Verify install names: + + ```console + otool -D /opt/homebrew/opt/libomp/lib/libomp.dylib + otool -D .venv/lib/python3.14/site-packages/torch/lib/libomp.dylib + ``` + +4. Confirm the `ci.py` preamble still sets `KMP_DUPLICATE_LIB_OK` *before* any import that could pull numpy/torch — someone refactoring imports may accidentally put `import numpy` above the `os.environ.setdefault` line. + +## References + +- Intel OpenMP runtime abort hint for Error #15, which itself names `KMP_DUPLICATE_LIB_OK=TRUE` as an "unsafe, unsupported, undocumented" workaround. +- Apple `dyld` man page — two-level namespace symbol binding and dylib install names (`otool -D`, `LC_ID_DYLIB`). diff --git a/examples/a2a3/aicpu_build_graph/bgemm/golden.py b/examples/a2a3/aicpu_build_graph/bgemm/golden.py index 37a4d8d03..233dfa646 100644 --- a/examples/a2a3/aicpu_build_graph/bgemm/golden.py +++ b/examples/a2a3/aicpu_build_graph/bgemm/golden.py @@ -15,7 +15,7 @@ Args layout: [A, B, C] — shape/dtype/size in ContinuousTensor metadata """ -import numpy as np +import torch __outputs__ = ["C"] RTOL = 1e-3 @@ -37,9 +37,9 @@ def generate_inputs(params: dict) -> list: """Generate input tensors with tile-first memory layout.""" - A = np.random.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K).astype(np.float32) * 0.01 - B = np.random.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N).astype(np.float32) * 0.01 - C = np.zeros((BATCH, GRID_M, GRID_N, TILE_M, TILE_N), dtype=np.float32) + A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32) * 0.01 + B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32) * 0.01 + C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32) A_flat = A.flatten() B_flat = B.flatten() @@ -54,9 +54,9 @@ def generate_inputs(params: dict) -> list: def compute_golden(tensors: dict, params: dict) -> None: """Compute golden result: C[m,n] = sum(k) A[m,k] @ B[k,n].""" - A = tensors["A"].reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) - B = tensors["B"].reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) - C = tensors["C"].reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) + A = torch.as_tensor(tensors["A"]).reshape(BATCH, GRID_M, GRID_K, TILE_M, 
TILE_K) + B = torch.as_tensor(tensors["B"]).reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) + C = torch.as_tensor(tensors["C"]).reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) C[:] = 0.0 @@ -64,6 +64,6 @@ def compute_golden(tensors: dict, params: dict) -> None: for m_idx in range(GRID_M): for n_idx in range(GRID_N): for k_idx in range(GRID_K): - C[batch, m_idx, n_idx] += np.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx]) + C[batch, m_idx, n_idx] += torch.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx]) tensors["C"][:] = C.flatten() diff --git a/examples/a2a3/host_build_graph/bgemm/golden.py b/examples/a2a3/host_build_graph/bgemm/golden.py index 771877180..d155c8832 100644 --- a/examples/a2a3/host_build_graph/bgemm/golden.py +++ b/examples/a2a3/host_build_graph/bgemm/golden.py @@ -15,7 +15,7 @@ Args layout: [A, B, C] — shape/dtype/size in ContinuousTensor metadata """ -import numpy as np +import torch __outputs__ = ["C"] RTOL = 1e-3 @@ -37,9 +37,9 @@ def generate_inputs(params: dict) -> list: """Generate input tensors with tile-first memory layout.""" - A = np.random.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K).astype(np.float32) * 0.01 - B = np.random.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N).astype(np.float32) * 0.01 - C = np.zeros((BATCH, GRID_M, GRID_N, TILE_M, TILE_N), dtype=np.float32) + A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32) * 0.01 + B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32) * 0.01 + C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32) return [ ("A", A), @@ -50,9 +50,9 @@ def generate_inputs(params: dict) -> list: def compute_golden(tensors: dict, params: dict) -> None: """Compute golden result: C[m,n] = sum(k) A[m,k] @ B[k,n].""" - A = tensors["A"].reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) - B = tensors["B"].reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) - C = tensors["C"].reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) + A = 
torch.as_tensor(tensors["A"]).reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K) + B = torch.as_tensor(tensors["B"]).reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N) + C = torch.as_tensor(tensors["C"]).reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N) C[:] = 0.0 @@ -60,4 +60,4 @@ def compute_golden(tensors: dict, params: dict) -> None: for m_idx in range(GRID_M): for n_idx in range(GRID_N): for k_idx in range(GRID_K): - C[batch, m_idx, n_idx] += np.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx]) + C[batch, m_idx, n_idx] += torch.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx])