hw-native-sys · ChaoWao · Apr 12, 2026 · Apr 10, 2026 · Apr 12, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -174,7 +174,7 @@ jobs:
         run: python ci.py -p a2a3sim -c d96c8784 -t 600 --clone-protocol https
 
       - name: Run pytest scene tests (a2a3sim)
-        run: pytest examples tests/st --platform a2a3sim -v
+        run: pytest examples tests/st --platform a2a3sim --device 0-15 -v
 
   st-sim-a5:
     runs-on: ${{ matrix.os }}
@@ -228,7 +228,7 @@ jobs:
         run: python ci.py -p a5sim -c d96c8784 -t 600 --clone-protocol https
 
       - name: Run pytest scene tests (a5sim)
-        run: pytest examples tests/st --platform a5sim -v
+        run: pytest examples tests/st --platform a5sim --device 0-15 -v
 
   # ---------- Python unit tests (a2a3 hardware) ----------
   ut-py-a2a3:

diff --git a/conftest.py b/conftest.py
@@ -6,10 +6,19 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # -----------------------------------------------------------------------------------------------------------
-"""Root conftest — CLI options, markers, ST platform filtering, and ST fixtures."""
+"""Root conftest — CLI options, markers, ST platform filtering, runtime isolation, and ST fixtures.
+
+Runtime isolation: CANN's AICPU framework caches the user .so per device context.
+Switching runtimes on the same device within one process causes hangs. When multiple
+runtimes are collected and --runtime is not specified, pytest_runtestloop spawns a
+subprocess per runtime so each gets a clean CANN context. See docs/testing.md.
+"""
 
 from __future__ import annotations
 
+import subprocess
+import sys
+
 import pytest
 
 
@@ -22,31 +31,24 @@ def _parse_device_range(s: str) -> list[int]:
 
 
 class DevicePool:
-    """Simple device allocator for pytest fixtures.
+    """Device allocator for pytest fixtures.
 
-    On sim platforms, device IDs are virtual — allocate always succeeds.
-    On real hardware, IDs are exclusive.
+    Manages a fixed set of device IDs. Tests allocate IDs before use
+    and release them after. Works identically for sim and onboard.
     """
 
-    def __init__(self, device_ids: list[int], *, is_sim: bool = False):
+    def __init__(self, device_ids: list[int]):
         self._available = list(device_ids)
-        self._is_sim = is_sim
-        self._sim_next = 0
 
     def allocate(self, n: int = 1) -> list[int]:
-        if self._is_sim:
-            ids = list(range(self._sim_next, self._sim_next + n))
-            self._sim_next += n
-            return ids
         if n > len(self._available):
             return []
         allocated = self._available[:n]
         self._available = self._available[n:]
         return allocated
 
     def release(self, ids: list[int]) -> None:
-        if not self._is_sim:
-            self._available.extend(ids)
+        self._available.extend(ids)
 
 
 _device_pool: DevicePool | None = None
@@ -58,6 +60,7 @@ def pytest_addoption(parser):
     parser.addoption("--device", action="store", default="0", help="Device ID or range (e.g., 0, 4-7)")
     parser.addoption("--case", action="store", default=None, help="Run specific case name only")
     parser.addoption("--all-cases", action="store_true", default=False, help="Include manual cases")
+    parser.addoption("--runtime", action="store", default=None, help="Only run tests for this runtime")
 
 
 def pytest_configure(config):
@@ -68,18 +71,31 @@ def pytest_configure(config):
 
 
 def pytest_collection_modifyitems(session, config, items):
-    """Skip ST tests based on --platform filter."""
+    """Skip ST tests based on --platform and --runtime filters, and order L3 before L2."""
     platform = config.getoption("--platform")
+    runtime_filter = config.getoption("--runtime")
+
+    # Sort: L3 tests first (they fork child processes that inherit main process CANN state,
+    # so they must run before L2 tests pollute the CANN context).
+    def sort_key(item):
+        cls = getattr(item, "cls", None)
+        level = getattr(cls, "_st_level", 0) if cls else 0
+        return (0 if level >= 3 else 1, item.nodeid)
+
+    items.sort(key=sort_key)
+
     for item in items:
-        # SceneTestCase subclass: skip if no case matches current platform
         cls = getattr(item, "cls", None)
         if cls and hasattr(cls, "CASES") and isinstance(cls.CASES, list):
             if not platform:
                 item.add_marker(pytest.mark.skip(reason="--platform required"))
             elif not any(platform in c.get("platforms", []) for c in cls.CASES):
                 item.add_marker(pytest.mark.skip(reason=f"No cases for {platform}"))
+            elif runtime_filter and getattr(cls, "_st_runtime", None) != runtime_filter:
+                item.add_marker(
+                    pytest.mark.skip(reason=f"Runtime {getattr(cls, '_st_runtime', '?')} != {runtime_filter}")
+                )
             continue
-        # Standalone function with @pytest.mark.platforms([...])
         platforms_marker = item.get_closest_marker("platforms")
         if platforms_marker:
             if not platform:
@@ -88,15 +104,77 @@ def pytest_collection_modifyitems(session, config, items):
                 item.add_marker(pytest.mark.skip(reason=f"Not supported on {platform}"))
 
 
+# ---------------------------------------------------------------------------
+# Runtime isolation: spawn subprocess per runtime
+# ---------------------------------------------------------------------------
+
+
+def _collect_st_runtimes(items):
+    """Return the distinct truthy ``_st_runtime`` values of the items' classes, sorted.
+
+    Items whose ``cls`` attribute is missing or falsy contribute nothing.
+    """
+    owners = (getattr(entry, "cls", None) for entry in items)
+    tags = (getattr(owner, "_st_runtime", None) for owner in owners if owner)
+    # A set drops duplicates; sorted() turns it into a deterministic list.
+    return sorted({tag for tag in tags if tag})
+
+
+def pytest_runtestloop(session):
+    """Isolate each collected runtime in its own pytest subprocess.
+
+    Returns None (deferring to pytest's default loop) for --collect-only runs, when
+    --runtime is given, or when at most one runtime was collected. Otherwise re-invokes
+    pytest once per runtime with ``--runtime <rt>`` appended, records an aggregate
+    pass/fail on the session, and returns True to suppress the default loop.
+    """
+    config = session.config
+    # Collection-only runs execute no tests, so there is nothing to isolate; without
+    # this guard every child would simply repeat the collection listing.
+    if config.getoption("collectonly", False) or config.getoption("--runtime"):
+        return None
+
+    runtimes = _collect_st_runtimes(session.items)
+    if len(runtimes) <= 1:
+        return None  # zero or one runtime — no isolation needed
+
+    # Children reuse the original command line; --runtime <rt> is appended per child.
+    params = config.invocation_params
+    base_args = [sys.executable, "-m", "pytest", *map(str, params.args)]
+
+    # subprocess.run blocks, so the children execute one after another.
+    failed = False
+    for rt in runtimes:
+        banner = "=" * 60
+        print(f"\n{banner}\n  Runtime: {rt}\n{banner}\n", flush=True)
+
+        # check=False: a failing child must not abort the remaining runtimes.
+        result = subprocess.run([*base_args, "--runtime", rt], check=False, cwd=params.dir)
+        if result.returncode != 0:
+            failed = True
+            print(f"\n*** Runtime {rt}: FAILED ***\n", flush=True)
+        else:
+            print(f"\n--- Runtime {rt}: PASSED ---\n", flush=True)
+
+    # Only an aggregate verdict is available here; per-test results stay in the children.
+    session.testscollected = len(session.items)
+    session.testsfailed = 1 if failed else 0
+
+    return True  # a truthy result stops pytest from running the default runtestloop
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
 @pytest.fixture(scope="session")
 def device_pool(request):
     """Session-scoped device pool parsed from --device."""
     global _device_pool  # noqa: PLW0603
     if _device_pool is None:
         raw = request.config.getoption("--device")
-        platform = request.config.getoption("--platform")
-        is_sim = platform is None or platform.endswith("sim")
-        _device_pool = DevicePool(_parse_device_range(raw), is_sim=is_sim)
+        _device_pool = DevicePool(_parse_device_range(raw))
     return _device_pool
 
 

diff --git a/docs/distributed_level_runtime.md b/docs/distributed_level_runtime.md
@@ -339,6 +339,12 @@ Python Application
                  └── forked child process    ← mailbox state machine
 ```
 
+## Runtime Isolation (Onboard Hardware)
+
+A single device can only run **one runtime** per CANN process context. CANN's AICPU framework (`libaicpu_extend_kernels.so`) caches the user AICPU .so on first load and skips reloading on subsequent launches. If a different runtime's AICPU .so is launched on the same device, the cached (stale) function pointers are used, causing hangs.
+
+This means: **do not reuse a device across different runtimes within a single process.** Either use separate processes (one per runtime), or partition devices so each runtime gets exclusive devices. See [testing.md](testing.md#runtime-isolation-constraint-onboard) for details and the pytest device allocation algorithm.
+
 ## Files
 
 | File | Purpose |

diff --git a/docs/testing.md b/docs/testing.md
@@ -331,6 +331,71 @@ The test will be automatically picked up by `ci.py`. New tests should prefer the
 
 See [ci.md](ci.md) for the full CI pipeline documentation, including the job matrix, runner constraints, marker scheme, and `ci.sh` internals.
 
+## Runtime Isolation Constraint (Onboard)
+
+**One device can only run one runtime per process.** Switching runtimes on the same device within a single process causes AICPU kernel hangs.
+
+### Root Cause
+
+CANN's AICPU dispatch uses a framework SO (`libaicpu_extend_kernels.so`) with a global singleton `BackendServerHandleManager` that:
+
+1. **`SaveSoFile`**: Writes the user AICPU .so to disk on first call, then sets `firstCreatSo_ = true` to skip all subsequent writes.
+2. **`SetTileFwkKernelMap`**: `dlopen`s the .so and caches function pointers on first call, then sets `firstLoadSo_ = true` to skip all subsequent loads.
+
+When a second runtime launches on the same device (same CANN process context), the Init kernel call hits the cached flags — the new AICPU .so is never written or loaded. The Exec kernel then calls function pointers from the first runtime's .so, which operates on incompatible data structures and hangs.
+
+### Impact
+
+| Scenario | Result |
+| -------- | ------ |
+| Same runtime, same device, sequential | Works (same .so, cached pointers valid) |
+| Different runtime, same device, sequential | **Hangs** (stale .so, wrong function pointers) |
+| Different runtime, different device | Works (separate CANN context per device) |
+| Different runtime, different process, same device | Works (`rtDeviceReset` between processes clears context) |
+
+### Mitigation in pytest
+
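+The root `conftest.py` isolates runtimes at the process level: when more than one runtime is collected and `--runtime` is not given, `pytest_runtestloop` re-invokes pytest once per runtime (appending `--runtime <rt>`), so each runtime starts with a clean CANN context. The in-process alternative is for the device allocator to group tests by runtime and assign each runtime group to exclusive devices. See "Device Allocation Algorithm" below.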
+
+## Device Allocation Algorithm (Onboard pytest)
+
+When running `pytest --platform a2a3 --device 8-11`, the fixture must allocate devices to tests such that:
+
+1. **Runtime isolation**: A device used by runtime A must not be reused by runtime B in the same process.
+2. **L3 multi-device**: L3 tests may need 2+ contiguous devices.
+3. **Efficiency**: Devices freed by one test of the same runtime can be reused by the next.
+
+### Algorithm
+
+```text
+Phase 1: Group tests by runtime
+  tensormap_and_ringbuffer: [TestVectorExample, TestScalarData, TestL3Dependency, ...]
+  aicpu_build_graph:        [TestPagedAttentionAicpuBuildGraph]
+  host_build_graph:         [TestPagedAttentionHostBuildGraph]
+
+Phase 2: Partition devices across runtime groups
+  Available: [8, 9, 10, 11]
+  tensormap_and_ringbuffer (6 tests, needs max 2 for L3 group): devices [8, 9]
+  aicpu_build_graph (1 test, needs 1):                          devices [10]
+  host_build_graph (1 test, needs 1):                           devices [11]
+
+Phase 3: Within each group, allocate from group's device pool
+  TestVectorExample:       dev 8 → run → release → dev 8 available again
+  TestScalarData:          dev 8 → run → release → OK (same runtime)
+  TestL3Dependency:        dev 8 → run → release
+  TestL3Group:             dev [8, 9] → run → release
+  TestPagedAttentionAicpuBuildGraph: dev 10 → run → release
+  TestPagedAttentionHostBuildGraph:  dev 11 → run → release
+```
+
+### Implementation
+
+The `DevicePool` in `conftest.py` is extended with runtime-aware partitioning. The `st_worker` fixture checks the test class's `_st_runtime` and allocates from the corresponding partition.
+
+### Sim platforms
+
+On sim (`a2a3sim`, `a5sim`), device IDs are virtual — no hardware state, no isolation constraint. All tests share the single pool parsed from `--device` (CI passes `--device 0-15`), and IDs are allocated and released exactly as they are onboard.
+
 ## Per-Case Device Filtering
 
 The `@scene_test(platforms=[...])` decorator provides per-case platform filtering. A single test class declares which platforms it supports:

diff --git a/...h/paged_attention/kernels/aic/aic_hub.cpp → ...h/paged_attention/kernels/aic/aic_hub.cpp b/...h/paged_attention/kernels/aic/aic_hub.cpp → ...h/paged_attention/kernels/aic/aic_hub.cpp
diff --git a/...d_attention/kernels/aic/aic_pv_matmul.cpp → ...d_attention/kernels/aic/aic_pv_matmul.cpp b/...d_attention/kernels/aic/aic_pv_matmul.cpp → ...d_attention/kernels/aic/aic_pv_matmul.cpp
diff --git a/...d_attention/kernels/aic/aic_qk_matmul.cpp → ...d_attention/kernels/aic/aic_qk_matmul.cpp b/...d_attention/kernels/aic/aic_qk_matmul.cpp → ...d_attention/kernels/aic/aic_qk_matmul.cpp
diff --git a/...h/paged_attention/kernels/aiv/aiv_hub.cpp → ...h/paged_attention/kernels/aiv/aiv_hub.cpp b/...h/paged_attention/kernels/aiv/aiv_hub.cpp → ...h/paged_attention/kernels/aiv/aiv_hub.cpp
diff --git a/...tention/kernels/aiv/aiv_online_update.cpp → ...tention/kernels/aiv/aiv_online_update.cpp b/...tention/kernels/aiv/aiv_online_update.cpp → ...tention/kernels/aiv/aiv_online_update.cpp
diff --git a/...ntion/kernels/aiv/aiv_softmax_prepare.cpp → ...ntion/kernels/aiv/aiv_softmax_prepare.cpp b/...ntion/kernels/aiv/aiv_softmax_prepare.cpp → ...ntion/kernels/aiv/aiv_softmax_prepare.cpp
diff --git a/...ls/orchestration/paged_attention_orch.cpp → ...ls/orchestration/paged_attention_orch.cpp b/...ls/orchestration/paged_attention_orch.cpp → ...ls/orchestration/paged_attention_orch.cpp