Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ jobs:
run: python ci.py -p a2a3sim -c d96c8784 -t 600 --clone-protocol https

- name: Run pytest scene tests (a2a3sim)
run: pytest examples tests/st --platform a2a3sim -v
run: pytest examples tests/st --platform a2a3sim --device 0-15 -v

st-sim-a5:
runs-on: ${{ matrix.os }}
Expand Down Expand Up @@ -228,7 +228,7 @@ jobs:
run: python ci.py -p a5sim -c d96c8784 -t 600 --clone-protocol https

- name: Run pytest scene tests (a5sim)
run: pytest examples tests/st --platform a5sim -v
run: pytest examples tests/st --platform a5sim --device 0-15 -v

# ---------- Python unit tests (a2a3 hardware) ----------
ut-py-a2a3:
Expand Down
116 changes: 97 additions & 19 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,19 @@
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""Root conftest — CLI options, markers, ST platform filtering, and ST fixtures."""
"""Root conftest — CLI options, markers, ST platform filtering, runtime isolation, and ST fixtures.

Runtime isolation: CANN's AICPU framework caches the user .so per device context.
Switching runtimes on the same device within one process causes hangs. When multiple
runtimes are collected and --runtime is not specified, pytest_runtestloop spawns a
subprocess per runtime so each gets a clean CANN context. See docs/testing.md.
"""

from __future__ import annotations

import subprocess
import sys

import pytest


Expand All @@ -22,31 +31,24 @@ def _parse_device_range(s: str) -> list[int]:


class DevicePool:
    """Device allocator for pytest fixtures.

    Manages a fixed set of device IDs. Tests allocate IDs before use
    and release them after. Works identically for sim and onboard.
    """

    def __init__(self, device_ids: list[int]):
        # Copy so allocate/release never mutate the caller's list.
        self._available = list(device_ids)

    def allocate(self, n: int = 1) -> list[int]:
        """Take ``n`` device IDs from the front of the pool.

        Returns the allocated IDs, or an empty list when fewer than ``n``
        IDs are currently available (the pool is left untouched in that case).

        Raises:
            ValueError: if ``n`` is negative. A negative count would otherwise
                slice from the wrong end and silently hand out devices.
        """
        if n < 0:
            raise ValueError(f"Cannot allocate a negative number of devices: {n}")
        if n > len(self._available):
            return []
        allocated = self._available[:n]
        self._available = self._available[n:]
        return allocated

    def release(self, ids: list[int]) -> None:
        """Return ``ids`` to the pool so later allocations can reuse them.

        IDs already present in the pool are ignored, so releasing the same
        device twice cannot make it allocatable to two tests at once.
        """
        for device_id in ids:
            if device_id not in self._available:
                self._available.append(device_id)


_device_pool: DevicePool | None = None
Expand All @@ -58,6 +60,7 @@ def pytest_addoption(parser):
parser.addoption("--device", action="store", default="0", help="Device ID or range (e.g., 0, 4-7)")
parser.addoption("--case", action="store", default=None, help="Run specific case name only")
parser.addoption("--all-cases", action="store_true", default=False, help="Include manual cases")
parser.addoption("--runtime", action="store", default=None, help="Only run tests for this runtime")


def pytest_configure(config):
Expand All @@ -68,18 +71,31 @@ def pytest_configure(config):


def pytest_collection_modifyitems(session, config, items):
"""Skip ST tests based on --platform filter."""
"""Skip ST tests based on --platform and --runtime filters, and order L3 before L2."""
platform = config.getoption("--platform")
runtime_filter = config.getoption("--runtime")

# Sort: L3 tests first (they fork child processes that inherit main process CANN state,
# so they must run before L2 tests pollute the CANN context).
def sort_key(item):
cls = getattr(item, "cls", None)
level = getattr(cls, "_st_level", 0) if cls else 0
return (0 if level >= 3 else 1, item.nodeid)

items.sort(key=sort_key)

for item in items:
# SceneTestCase subclass: skip if no case matches current platform
cls = getattr(item, "cls", None)
if cls and hasattr(cls, "CASES") and isinstance(cls.CASES, list):
if not platform:
item.add_marker(pytest.mark.skip(reason="--platform required"))
elif not any(platform in c.get("platforms", []) for c in cls.CASES):
item.add_marker(pytest.mark.skip(reason=f"No cases for {platform}"))
elif runtime_filter and getattr(cls, "_st_runtime", None) != runtime_filter:
item.add_marker(
pytest.mark.skip(reason=f"Runtime {getattr(cls, '_st_runtime', '?')} != {runtime_filter}")
)
continue
# Standalone function with @pytest.mark.platforms([...])
platforms_marker = item.get_closest_marker("platforms")
if platforms_marker:
if not platform:
Expand All @@ -88,15 +104,77 @@ def pytest_collection_modifyitems(session, config, items):
item.add_marker(pytest.mark.skip(reason=f"Not supported on {platform}"))


# ---------------------------------------------------------------------------
# Runtime isolation: spawn subprocess per runtime
# ---------------------------------------------------------------------------


def _collect_st_runtimes(items):
    """Return sorted list of unique runtimes from collected SceneTestCase items.

    An item contributes a runtime only when it has a truthy ``cls`` attribute
    whose ``_st_runtime`` attribute is itself truthy; every other item is
    ignored.
    """
    runtimes = set()
    for item in items:
        cls = getattr(item, "cls", None)
        if not cls:
            continue  # no owning class on this item, so no runtime to read
        runtime = getattr(cls, "_st_runtime", None)
        if runtime:
            runtimes.add(runtime)
    return sorted(runtimes)


def pytest_runtestloop(session):
    """Override test execution to isolate runtimes in subprocesses.

    If --runtime is specified (or at most one runtime was collected), return
    None so pytest runs its normal test loop. The same applies to
    --collect-only runs, which must not execute anything.

    Otherwise, re-invoke pytest once per collected runtime (same arguments
    plus ``--runtime <rt>``), record the aggregate pass/fail state on the
    session, and return True to suppress the default test loop.
    """
    config = session.config
    if config.getoption("--runtime"):
        return None  # single runtime — let pytest run normally

    # Returning True below would pre-empt pytest's own --collect-only handling
    # and launch real test subprocesses, so bail out before that can happen.
    if config.getoption("collectonly", False):
        return None

    runtimes = _collect_st_runtimes(session.items)
    if len(runtimes) <= 1:
        return None  # zero or one runtime — no isolation needed

    # Multiple runtimes: re-invoke pytest with the same args + --runtime <rt>.
    base_args = [sys.executable, "-m", "pytest"]
    base_args.extend(str(arg) for arg in config.invocation_params.args)

    separator = "=" * 60
    failed = False
    for rt in runtimes:
        header = f" Runtime: {rt}"
        print(f"\n{separator}\n{header}\n{separator}\n", flush=True)

        # check=False: a failing runtime must not abort the remaining ones.
        result = subprocess.run(
            [*base_args, "--runtime", rt],
            check=False,
            cwd=config.invocation_params.dir,
        )
        if result.returncode != 0:
            failed = True
            print(f"\n*** Runtime {rt}: FAILED ***\n", flush=True)
        else:
            print(f"\n--- Runtime {rt}: PASSED ---\n", flush=True)

    if failed:
        # Any non-zero count makes the parent session report failure.
        session.testsfailed = 1
    else:
        session.testscollected = len(session.items)
        session.testsfailed = 0

    return True  # returning True prevents default runtestloop


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


@pytest.fixture(scope="session")
def device_pool(request):
    """Session-scoped device pool parsed from --device.

    The pool is built on first use from the ``--device`` option and cached in
    the module-level ``_device_pool``, so later requests get the same object.
    """
    global _device_pool  # noqa: PLW0603
    if _device_pool is None:
        raw = request.config.getoption("--device")
        # DevicePool takes only the ID list; there is no sim-specific mode.
        _device_pool = DevicePool(_parse_device_range(raw))
    return _device_pool


Expand Down
6 changes: 6 additions & 0 deletions docs/distributed_level_runtime.md
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,12 @@ Python Application
└── forked child process ← mailbox state machine
```

## Runtime Isolation (Onboard Hardware)

A single device can only run **one runtime** per CANN process context. CANN's AICPU framework (`libaicpu_extend_kernels.so`) caches the user AICPU .so on first load and skips reloading on subsequent launches. If a different runtime's AICPU .so is launched on the same device, the cached (stale) function pointers are used, causing hangs.

This means: **do not reuse a device across different runtimes within a single process.** Either use separate processes (one per runtime), or partition devices so each runtime gets exclusive devices. See [testing.md](testing.md#runtime-isolation-constraint-onboard) for details and the pytest device allocation algorithm.

## Files

| File | Purpose |
Expand Down
65 changes: 65 additions & 0 deletions docs/testing.md
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,71 @@ The test will be automatically picked up by `ci.py`. New tests should prefer the

See [ci.md](ci.md) for the full CI pipeline documentation, including the job matrix, runner constraints, marker scheme, and `ci.sh` internals.

## Runtime Isolation Constraint (Onboard)

**One device can only run one runtime per process.** Switching runtimes on the same device within a single process causes AICPU kernel hangs.

### Root Cause

CANN's AICPU dispatch uses a framework SO (`libaicpu_extend_kernels.so`) with a global singleton `BackendServerHandleManager` that:

1. **`SaveSoFile`**: Writes the user AICPU .so to disk on first call, then sets `firstCreatSo_ = true` to skip all subsequent writes.
2. **`SetTileFwkKernelMap`**: `dlopen`s the .so and caches function pointers on first call, then sets `firstLoadSo_ = true` to skip all subsequent loads.

When a second runtime launches on the same device (same CANN process context), the Init kernel call hits the cached flags — the new AICPU .so is never written or loaded. The Exec kernel then calls function pointers from the first runtime's .so, which operates on incompatible data structures and hangs.

### Impact

| Scenario | Result |
| -------- | ------ |
| Same runtime, same device, sequential | Works (same .so, cached pointers valid) |
| Different runtime, same device, sequential | **Hangs** (stale .so, wrong function pointers) |
| Different runtime, different device | Works (separate CANN context per device) |
| Different runtime, different process, same device | Works (`rtDeviceReset` between processes clears context) |

### Mitigation in pytest

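`conftest.py` isolates runtimes by process: when more than one runtime is collected and `--runtime` is not given, `pytest_runtestloop` re-runs pytest in one subprocess per runtime (each with `--runtime <name>`), so every runtime starts with a clean CANN context. Grouping tests by runtime and assigning each runtime group to exclusive devices is the in-process alternative. See "Device Allocation Algorithm" below.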

## Device Allocation Algorithm (Onboard pytest)

When running `pytest --platform a2a3 --device 8-11`, the fixture must allocate devices to tests such that:

1. **Runtime isolation**: A device used by runtime A must not be reused by runtime B in the same process.
2. **L3 multi-device**: L3 tests may need 2+ contiguous devices.
3. **Efficiency**: Devices freed by one test of the same runtime can be reused by the next.

### Algorithm

```text
Phase 1: Group tests by runtime
tensormap_and_ringbuffer: [TestVectorExample, TestScalarData, TestL3Dependency, ...]
aicpu_build_graph: [TestPagedAttentionAicpuBuildGraph]
host_build_graph: [TestPagedAttentionHostBuildGraph]

Phase 2: Partition devices across runtime groups
Available: [8, 9, 10, 11]
tensormap_and_ringbuffer (6 tests, needs max 2 for L3 group): devices [8, 9]
aicpu_build_graph (1 test, needs 1): devices [10]
host_build_graph (1 test, needs 1): devices [11]

Phase 3: Within each group, allocate from group's device pool
TestVectorExample: dev 8 → run → release → dev 8 available again
TestScalarData: dev 8 → run → release → OK (same runtime)
TestL3Dependency: dev 8 → run → release
TestL3Group: dev [8, 9] → run → release
TestPagedAttentionAicpuBuildGraph: dev 10 → run → release
TestPagedAttentionHostBuildGraph: dev 11 → run → release
```

### Implementation

The `DevicePool` in `conftest.py` is extended with runtime-aware partitioning. The `st_worker` fixture checks the test class's `_st_runtime` and allocates from the corresponding partition.

### Sim platforms

On sim (`a2a3sim`, `a5sim`), device IDs are virtual — no hardware state, no isolation constraint. Sim tests allocate from the same `--device` pool as onboard tests (CI passes `--device 0-15`); IDs are not auto-incremented.

## Per-Case Device Filtering

The `@scene_test(platforms=[...])` decorator provides per-case platform filtering. A single test class declares which platforms it supports:
Expand Down
Loading
Loading