diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 22e6cbbba..5b43bdc02 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -174,7 +174,7 @@ jobs: run: python ci.py -p a2a3sim -c d96c8784 -t 600 --clone-protocol https - name: Run pytest scene tests (a2a3sim) - run: pytest examples tests/st --platform a2a3sim -v + run: pytest examples tests/st --platform a2a3sim --device 0-15 -v st-sim-a5: runs-on: ${{ matrix.os }} @@ -228,7 +228,7 @@ jobs: run: python ci.py -p a5sim -c d96c8784 -t 600 --clone-protocol https - name: Run pytest scene tests (a5sim) - run: pytest examples tests/st --platform a5sim -v + run: pytest examples tests/st --platform a5sim --device 0-15 -v # ---------- Python unit tests (a2a3 hardware) ---------- ut-py-a2a3: diff --git a/conftest.py b/conftest.py index 5e33c740f..a36945dea 100644 --- a/conftest.py +++ b/conftest.py @@ -6,10 +6,19 @@ # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. # See LICENSE in the root of the software repository for the full text of the License. # ----------------------------------------------------------------------------------------------------------- -"""Root conftest — CLI options, markers, ST platform filtering, and ST fixtures.""" +"""Root conftest — CLI options, markers, ST platform filtering, runtime isolation, and ST fixtures. + +Runtime isolation: CANN's AICPU framework caches the user .so per device context. +Switching runtimes on the same device within one process causes hangs. When multiple +runtimes are collected and --runtime is not specified, pytest_runtestloop spawns a +subprocess per runtime so each gets a clean CANN context. See docs/testing.md. +""" from __future__ import annotations +import subprocess +import sys + import pytest @@ -22,22 +31,16 @@ def _parse_device_range(s: str) -> list[int]: class DevicePool: - """Simple device allocator for pytest fixtures. + """Device allocator for pytest fixtures. 
- On sim platforms, device IDs are virtual — allocate always succeeds. - On real hardware, IDs are exclusive. + Manages a fixed set of device IDs. Tests allocate IDs before use + and release them after. Works identically for sim and onboard. """ - def __init__(self, device_ids: list[int], *, is_sim: bool = False): + def __init__(self, device_ids: list[int]): self._available = list(device_ids) - self._is_sim = is_sim - self._sim_next = 0 def allocate(self, n: int = 1) -> list[int]: - if self._is_sim: - ids = list(range(self._sim_next, self._sim_next + n)) - self._sim_next += n - return ids if n > len(self._available): return [] allocated = self._available[:n] @@ -45,8 +48,7 @@ def allocate(self, n: int = 1) -> list[int]: return allocated def release(self, ids: list[int]) -> None: - if not self._is_sim: - self._available.extend(ids) + self._available.extend(ids) _device_pool: DevicePool | None = None @@ -58,6 +60,7 @@ def pytest_addoption(parser): parser.addoption("--device", action="store", default="0", help="Device ID or range (e.g., 0, 4-7)") parser.addoption("--case", action="store", default=None, help="Run specific case name only") parser.addoption("--all-cases", action="store_true", default=False, help="Include manual cases") + parser.addoption("--runtime", action="store", default=None, help="Only run tests for this runtime") def pytest_configure(config): @@ -68,18 +71,31 @@ def pytest_configure(config): def pytest_collection_modifyitems(session, config, items): - """Skip ST tests based on --platform filter.""" + """Skip ST tests based on --platform and --runtime filters, and order L3 before L2.""" platform = config.getoption("--platform") + runtime_filter = config.getoption("--runtime") + + # Sort: L3 tests first (they fork child processes that inherit main process CANN state, + # so they must run before L2 tests pollute the CANN context). 
+ def sort_key(item): + cls = getattr(item, "cls", None) + level = getattr(cls, "_st_level", 0) if cls else 0 + return (0 if level >= 3 else 1, item.nodeid) + + items.sort(key=sort_key) + for item in items: - # SceneTestCase subclass: skip if no case matches current platform cls = getattr(item, "cls", None) if cls and hasattr(cls, "CASES") and isinstance(cls.CASES, list): if not platform: item.add_marker(pytest.mark.skip(reason="--platform required")) elif not any(platform in c.get("platforms", []) for c in cls.CASES): item.add_marker(pytest.mark.skip(reason=f"No cases for {platform}")) + elif runtime_filter and getattr(cls, "_st_runtime", None) != runtime_filter: + item.add_marker( + pytest.mark.skip(reason=f"Runtime {getattr(cls, '_st_runtime', '?')} != {runtime_filter}") + ) continue - # Standalone function with @pytest.mark.platforms([...]) platforms_marker = item.get_closest_marker("platforms") if platforms_marker: if not platform: @@ -88,15 +104,77 @@ def pytest_collection_modifyitems(session, config, items): item.add_marker(pytest.mark.skip(reason=f"Not supported on {platform}")) +# --------------------------------------------------------------------------- +# Runtime isolation: spawn subprocess per runtime +# --------------------------------------------------------------------------- + + +def _collect_st_runtimes(items): + """Return sorted list of unique runtimes from collected SceneTestCase items.""" + runtimes = set() + for item in items: + cls = getattr(item, "cls", None) + rt = getattr(cls, "_st_runtime", None) if cls else None + if rt: + runtimes.add(rt) + return sorted(runtimes) + + +def pytest_runtestloop(session): + """Override test execution to isolate runtimes in subprocesses. + + If --runtime is specified (or only one runtime collected), run normally. + Otherwise, spawn one subprocess per runtime and aggregate results. 
+ """ + runtime_filter = session.config.getoption("--runtime") + if runtime_filter: + return # single runtime — let pytest run normally + + runtimes = _collect_st_runtimes(session.items) + if len(runtimes) <= 1: + return # zero or one runtime — no isolation needed + + # Multiple runtimes: spawn subprocess per runtime + # Re-invoke pytest with the same args + --runtime for each runtime + base_args = [sys.executable, "-m", "pytest"] + for arg in session.config.invocation_params.args: + base_args.append(str(arg)) + + failed = False + for rt in runtimes: + # Build subprocess command: inject --runtime + cmd = base_args + ["--runtime", rt] + header = f" Runtime: {rt}" + print(f"\n{'=' * 60}\n{header}\n{'=' * 60}\n", flush=True) + + result = subprocess.run(cmd, check=False, cwd=session.config.invocation_params.dir) + if result.returncode != 0: + failed = True + print(f"\n*** Runtime {rt}: FAILED ***\n", flush=True) + else: + print(f"\n--- Runtime {rt}: PASSED ---\n", flush=True) + + if failed: + session.testsfailed = 1 + else: + session.testscollected = sum(1 for _ in session.items) + session.testsfailed = 0 + + return True # returning True prevents default runtestloop + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + @pytest.fixture(scope="session") def device_pool(request): """Session-scoped device pool parsed from --device.""" global _device_pool # noqa: PLW0603 if _device_pool is None: raw = request.config.getoption("--device") - platform = request.config.getoption("--platform") - is_sim = platform is None or platform.endswith("sim") - _device_pool = DevicePool(_parse_device_range(raw), is_sim=is_sim) + _device_pool = DevicePool(_parse_device_range(raw)) return _device_pool diff --git a/docs/distributed_level_runtime.md b/docs/distributed_level_runtime.md index 8a73dca7c..1f1204639 100644 --- a/docs/distributed_level_runtime.md +++ 
b/docs/distributed_level_runtime.md @@ -339,6 +339,12 @@ Python Application └── forked child process ← mailbox state machine ``` +## Runtime Isolation (Onboard Hardware) + +A single device can only run **one runtime** per CANN process context. CANN's AICPU framework (`libaicpu_extend_kernels.so`) caches the user AICPU .so on first load and skips reloading on subsequent launches. If a different runtime's AICPU .so is launched on the same device, the cached (stale) function pointers are used, causing hangs. + +This means: **do not reuse a device across different runtimes within a single process.** Either use separate processes (one per runtime), or partition devices so each runtime gets exclusive devices. See [testing.md](testing.md#runtime-isolation-constraint-onboard) for details and the pytest device allocation algorithm. + ## Files | File | Purpose | diff --git a/docs/testing.md b/docs/testing.md index 69612672e..7f38f8298 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -331,6 +331,71 @@ The test will be automatically picked up by `ci.py`. New tests should prefer the See [ci.md](ci.md) for the full CI pipeline documentation, including the job matrix, runner constraints, marker scheme, and `ci.sh` internals. +## Runtime Isolation Constraint (Onboard) + +**One device can only run one runtime per process.** Switching runtimes on the same device within a single process causes AICPU kernel hangs. + +### Root Cause + +CANN's AICPU dispatch uses a framework SO (`libaicpu_extend_kernels.so`) with a global singleton `BackendServerHandleManager` that: + +1. **`SaveSoFile`**: Writes the user AICPU .so to disk on first call, then sets `firstCreatSo_ = true` to skip all subsequent writes. +2. **`SetTileFwkKernelMap`**: `dlopen`s the .so and caches function pointers on first call, then sets `firstLoadSo_ = true` to skip all subsequent loads. 
+ +When a second runtime launches on the same device (same CANN process context), the Init kernel call hits the cached flags — the new AICPU .so is never written or loaded. The Exec kernel then calls function pointers from the first runtime's .so, which operates on incompatible data structures and hangs. + +### Impact + +| Scenario | Result | +| -------- | ------ | +| Same runtime, same device, sequential | Works (same .so, cached pointers valid) | +| Different runtime, same device, sequential | **Hangs** (stale .so, wrong function pointers) | +| Different runtime, different device | Works (separate CANN context per device) | +| Different runtime, different process, same device | Works (`rtDeviceReset` between processes clears context) | + +### Mitigation in pytest + +`conftest.py` isolates runtimes by process rather than by device: when more than one runtime is collected and `--runtime` is not given, `pytest_runtestloop` re-runs pytest once per runtime in a subprocess (adding `--runtime <name>`), so each runtime starts with a clean CANN context. "Device Allocation Algorithm" below describes the alternative of partitioning devices between runtimes inside a single process. + +## Device Allocation Algorithm (Onboard pytest) + +When running `pytest --platform a2a3 --device 8-11`, the fixture must allocate devices to tests such that: + +1. **Runtime isolation**: A device used by runtime A must not be reused by runtime B in the same process. +2. **L3 multi-device**: L3 tests may need 2+ contiguous devices. +3. **Efficiency**: Devices freed by one test of the same runtime can be reused by the next. + +### Algorithm + +```text +Phase 1: Group tests by runtime + tensormap_and_ringbuffer: [TestVectorExample, TestScalarData, TestL3Dependency, ...] 
+ aicpu_build_graph: [TestPagedAttentionAicpuBuildGraph] + host_build_graph: [TestPagedAttentionHostBuildGraph] + +Phase 2: Partition devices across runtime groups + Available: [8, 9, 10, 11] + tensormap_and_ringbuffer (6 tests, needs max 2 for L3 group): devices [8, 9] + aicpu_build_graph (1 test, needs 1): devices [10] + host_build_graph (1 test, needs 1): devices [11] + +Phase 3: Within each group, allocate from group's device pool + TestVectorExample: dev 8 → run → release → dev 8 available again + TestScalarData: dev 8 → run → release → OK (same runtime) + TestL3Dependency: dev 8 → run → release + TestL3Group: dev [8, 9] → run → release + TestPagedAttentionAicpuBuildGraph: dev 10 → run → release + TestPagedAttentionHostBuildGraph: dev 11 → run → release +``` + +### Implementation + +The `DevicePool` in `conftest.py` is currently a plain allocate/release pool over the IDs given by `--device` and is not runtime-aware; to apply this algorithm it must be extended with per-runtime partitions, with the `st_worker` fixture checking the test class's `_st_runtime` and allocating from the corresponding partition. + +### Sim platforms + +On sim (`a2a3sim`, `a5sim`), device IDs are virtual — no hardware state, no isolation constraint. The pool is still the fixed set of IDs given by `--device` (CI passes `--device 0-15`), allocated and released exactly as on hardware. + ## Per-Case Device Filtering The `@scene_test(platforms=[...])` decorator provides per-case platform filtering. 
A single test class declares which platforms it supports: diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp similarity index 100% rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp similarity index 100% rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp similarity index 100% rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp similarity index 100% rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp similarity index 100% rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp 
b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp similarity index 100% rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp similarity index 100% rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp diff --git a/examples/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py b/examples/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py new file mode 100644 index 000000000..62687419b --- /dev/null +++ b/examples/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Paged attention — aicpu_build_graph runtime (production scale, bfloat16). 
+ +Tests aicpu_build_graph runtime with hub kernels (aic_hub, aiv_hub), +INOUT tensors, and AIC+AIV mixed execution. +""" + +import torch +from paged_attention_golden import compute_golden as _pa_compute_golden # noqa: PLC0415 +from paged_attention_golden import generate_inputs as _pa_generate_inputs # noqa: PLC0415 +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + +@scene_test(level=2, runtime="aicpu_build_graph") +class TestPagedAttentionAicpuBuildGraph(SceneTestCase): + """Paged attention with aicpu_build_graph runtime and hub kernels.""" + + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/paged_attention_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "source": "kernels/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 4, + "source": "kernels/aic/aic_hub.cpp", + "core_type": "aic", + "signature": [], + }, + { + "func_id": 1, + "source": "kernels/aiv/aiv_softmax_prepare.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "source": "kernels/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + { + "func_id": 5, + "source": "kernels/aiv/aiv_hub.cpp", + "core_type": "aiv", + "signature": [], + }, + ], + } + + CASES = [ + { + "name": "case1", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": { + "batch": 256, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + { + 
"name": "case2", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": { + "batch": 64, + "num_heads": 64, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 64, + "context_len": 8192, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + ] + + def generate_args(self, params): + inputs = _pa_generate_inputs(params) + specs = [] + for name, val in inputs: + if isinstance(val, torch.Tensor): + specs.append(Tensor(name, val)) + else: + specs.append(Scalar(name, val)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a2a3/host_build_graph/paged_attention/golden.py b/examples/a2a3/host_build_graph/paged_attention/golden.py deleted file mode 100644 index fc57c9f7d..000000000 --- a/examples/a2a3/host_build_graph/paged_attention/golden.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -"""Paged Attention Golden - host_build_graph example (small scale, float16). 
- -Args layout: [query, key_cache, value_cache, block_table, context_lens, out, scale] - - Tensors retain original multi-dimensional shapes (ContinuousTensor metadata carries shape/dtype) - - scale is a scalar float parameter -""" - -from paged_attention_golden import ( - compute_golden, # noqa: F401 - run_golden_test, -) -from paged_attention_golden import generate_inputs as _generate_inputs - -__outputs__ = ["out"] - -RTOL = 1e-2 -ATOL = 1e-2 - -ALL_CASES = { - "Case1": { - "batch": 1, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 16, - "block_size": 16, - "context_len": 16, - "max_model_len": 256, - "dtype": "float16", - }, - "Case2": { - "batch": 1, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 16, - "block_size": 16, - "context_len": 64, - "max_model_len": 256, - "dtype": "float16", - }, -} - -DEFAULT_CASE = "Case1" - - -def generate_inputs(params: dict) -> list: - return _generate_inputs(params) - - -if __name__ == "__main__": - run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py b/examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py deleted file mode 100644 index 0245cc8a5..000000000 --- a/examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- -""" -Paged Attention Kernel and Orchestration Configuration - -Defines the kernels and orchestration function for paged attention -with AIC/AIV subgraph splitting: - -AIC Kernels (Matrix Multiplication): - - aic_qk_matmul: Q @ K^T computation - - aic_pv_matmul: P @ V computation - -AIV Kernels (Vector Operations): - - aiv_softmax_prepare: scale, rowmax, exp, rowsum - - aiv_online_update: online softmax accumulation + fused normalization -""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -_KERNELS_ROOT = Path(__file__).parent - -# Orchestration config -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), - "function_name": "build_paged_attention_graph", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], -} - -# Kernel configs -KERNELS = [ - # AIC kernels (matrix multiplication using Cube unit) - { - "func_id": 0, - "name": "QK", - "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "PV", - "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - # AIV kernels (vector operations) - { - "func_id": 1, - "name": "SF", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "name": "UP", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, -] - -# Runtime configuration -RUNTIME_CONFIG = { - "runtime": "host_build_graph", - "aicpu_thread_num": 3, - "block_dim": 3, -} diff --git a/examples/a2a3/host_build_graph/paged_attention/test_paged_attention.py 
b/examples/a2a3/host_build_graph/paged_attention/test_paged_attention.py new file mode 100644 index 000000000..ec67f7307 --- /dev/null +++ b/examples/a2a3/host_build_graph/paged_attention/test_paged_attention.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Paged attention — host_build_graph runtime (small scale, float16). + +Tests host_build_graph runtime with AIC+AIV mixed execution and INOUT tensors. 
+""" + +import torch +from paged_attention_golden import compute_golden as _pa_compute_golden # noqa: PLC0415 +from paged_attention_golden import generate_inputs as _pa_generate_inputs # noqa: PLC0415 +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + +@scene_test(level=2, runtime="host_build_graph") +class TestPagedAttentionHostBuildGraph(SceneTestCase): + """Paged attention with host_build_graph runtime.""" + + RTOL = 1e-2 + ATOL = 1e-2 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/paged_attention_orch.cpp", + "function_name": "build_paged_attention_graph", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "source": "kernels/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/aiv_softmax_prepare.cpp", + "core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "source": "kernels/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "small1", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 3, "block_dim": 3}, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 16, + "max_model_len": 256, + "dtype": "float16", + }, + }, + { + "name": "small2", + "platforms": ["a2a3sim", "a2a3"], + "config": {"aicpu_thread_num": 3, "block_dim": 3}, + "manual": True, + "params": { + "batch": 1, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 16, + "block_size": 16, + "context_len": 64, + "max_model_len": 256, + "dtype": "float16", + }, + }, + ] + + def generate_args(self, params): 
+ inputs = _pa_generate_inputs(params) + specs = [] + for name, val in inputs: + if isinstance(val, torch.Tensor): + specs.append(Tensor(name, val)) + else: + specs.append(Scalar(name, val)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py similarity index 86% rename from tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py rename to examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py index 2ffe2b19f..13652252d 100644 --- a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py +++ b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py @@ -1,3 +1,11 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- """ Golden test specification for alternating matmul-add test. 
@@ -10,9 +18,10 @@ """ import ctypes -import torch import time +import torch + __outputs__ = ["C", "Z"] RTOL = 1e-3 ATOL = 1e-3 @@ -34,7 +43,6 @@ "matmul_batch": 4, # Number of matmul tiles per task "add_batch": 5, # Number of add tiles per task }, - } DEFAULT_CASE = "Case1" @@ -67,14 +75,10 @@ def generate_inputs(params: dict) -> list: if total_matmul_tasks % matmul_batch != 0: raise ValueError( - f"total_matmul_tasks ({total_matmul_tasks}) must be " - f"divisible by matmul_batch ({matmul_batch})" + f"total_matmul_tasks ({total_matmul_tasks}) must be divisible by matmul_batch ({matmul_batch})" ) if total_add_tasks % add_batch != 0: - raise ValueError( - f"total_add_tasks ({total_add_tasks}) must be " - f"divisible by add_batch ({add_batch})" - ) + raise ValueError(f"total_add_tasks ({total_add_tasks}) must be divisible by add_batch ({add_batch})") # Prevent integer overflow in orchestration (task_idx = b * M + m or b * N + n) INT32_MAX = 2**31 - 1 diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp similarity index 100% rename from tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp rename to examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp similarity index 100% rename from tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp rename to examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py 
similarity index 100% rename from tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py rename to examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp similarity index 100% rename from tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp diff --git a/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/test_alternating_matmul_add.py b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/test_alternating_matmul_add.py new file mode 100644 index 000000000..b25b8ee51 --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/test_alternating_matmul_add.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Alternating matmul + add: interleaved AIC (matmul 128x128) and AIV (add 128x128) tasks. 
+ +Tests AIC+AIV mixed execution with scalar parameters and batched task submission. +C[b,m] = A[b,m] @ B[b,m], Z[b,n] = X[b,n] + Y[b,n]. +""" + +import ctypes + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestAlternatingMatmulAdd(SceneTestCase): + """Alternating matmul + add with scalar parameters.""" + + RTOL = 1e-3 + ATOL = 1e-3 + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/alternating_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aic/kernel_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + ], + } + + CASES = [ + { + "name": "default", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": {"batch": 1, "M": 1, "N": 1, "matmul_batch": 1, "add_batch": 1}, + }, + ] + + def generate_args(self, params): + batch = params["batch"] + M = params["M"] + N = params["N"] + matmul_batch = params.get("matmul_batch", 1) + add_batch = params.get("add_batch", 1) + matmul_size = 128 + add_rows = 128 + add_cols = 128 + + torch.manual_seed(42) + A = torch.randn(batch, M, matmul_size, matmul_size, dtype=torch.float32) * 0.01 + B = torch.randn(batch, M, matmul_size, matmul_size, dtype=torch.float32) * 0.01 + C = torch.zeros(batch, M, matmul_size, matmul_size, dtype=torch.float32) + X = torch.randn(batch, N, add_rows, add_cols, dtype=torch.float32) * 0.01 + Y = torch.randn(batch, N, add_rows, add_cols, dtype=torch.float32) * 0.01 + Z = torch.zeros(batch, N, add_rows, add_cols, dtype=torch.float32) + + return TaskArgsBuilder( + Tensor("A", A.flatten()), + Tensor("B", 
B.flatten()), + Tensor("C", C.flatten()), + Tensor("X", X.flatten()), + Tensor("Y", Y.flatten()), + Tensor("Z", Z.flatten()), + Scalar("batch", ctypes.c_int64(batch)), + Scalar("M_val", ctypes.c_int64(M)), + Scalar("N_val", ctypes.c_int64(N)), + Scalar("matmul_batch", ctypes.c_int64(matmul_batch)), + Scalar("add_batch", ctypes.c_int64(add_batch)), + ) + + def compute_golden(self, args, params): + batch = params["batch"] + M = params["M"] + N = params["N"] + matmul_size = 128 + add_rows = 128 + add_cols = 128 + + A = args.A.reshape(batch, M, matmul_size, matmul_size) + B = args.B.reshape(batch, M, matmul_size, matmul_size) + C = args.C.reshape(batch, M, matmul_size, matmul_size) + X = args.X.reshape(batch, N, add_rows, add_cols) + Y = args.Y.reshape(batch, N, add_rows, add_cols) + Z = args.Z.reshape(batch, N, add_rows, add_cols) + + for b in range(batch): + for m in range(M): + C[b, m] = torch.matmul(A[b, m], B[b, m]) + for b in range(batch): + for n in range(N): + Z[b, n] = X[b, n] + Y[b, n] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py new file mode 100644 index 000000000..0fa823d8e --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Paged attention with small ring buffer sizes — stress test for ring rotation/reclamation. + +Tests RUNTIME_ENV (PTO2_RING_TASK_WINDOW, PTO2_RING_HEAP, PTO2_RING_DEP_POOL), +INOUT tensors, bfloat16, and AIC+AIV mixed execution. +""" + +import torch +from paged_attention_golden import compute_golden as _pa_compute_golden # noqa: PLC0415 +from paged_attention_golden import generate_inputs as _pa_generate_inputs # noqa: PLC0415 +from simpler.task_interface import ArgDirection as D + +from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test + +PA_KERNELS = "../../../../tests/st/a2a3/tensormap_and_ringbuffer/paged_attention/kernels" + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestPagedAttentionRingbuffer(SceneTestCase): + """Paged attention with small ring buffer sizes for stress testing.""" + + RTOL = 1e-3 + ATOL = 1e-3 + RUNTIME_ENV = { + "PTO2_RING_TASK_WINDOW": "128", + "PTO2_RING_HEAP": "262144", + "PTO2_RING_DEP_POOL": "256", + } + + CALLABLE = { + "orchestration": { + "source": f"{PA_KERNELS}/orchestration/paged_attention_orch.cpp", + "function_name": "build_paged_attention_graph", + "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": f"{PA_KERNELS}/aic/aic_qk_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 2, + "source": f"{PA_KERNELS}/aic/aic_pv_matmul.cpp", + "core_type": "aic", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": f"{PA_KERNELS}/aiv/aiv_softmax_prepare.cpp", + 
"core_type": "aiv", + "signature": [D.IN, D.OUT, D.OUT, D.OUT], + }, + { + "func_id": 3, + "source": f"{PA_KERNELS}/aiv/aiv_online_update.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "ringbuffer_stress", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 24}, + "params": { + "batch": 32, + "num_heads": 16, + "kv_head_num": 1, + "head_dim": 128, + "block_size": 128, + "context_len": 4096, + "max_model_len": 32768, + "dtype": "bfloat16", + }, + }, + ] + + def generate_args(self, params): + inputs = _pa_generate_inputs(params) + specs = [] + for name, val in inputs: + if isinstance(val, torch.Tensor): + specs.append(Tensor(name, val)) + else: + specs.append(Scalar(name, val)) + return TaskArgsBuilder(*specs) + + def compute_golden(self, args, params): + tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)} + _pa_compute_golden(tensors, params) + for s in args.specs: + if isinstance(s, Tensor) and s.name in tensors: + getattr(args, s.name)[:] = tensors[s.name] + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_add.cpp similarity index 100% rename from tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_add.cpp rename to examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_add.cpp diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_noop.cpp b/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_noop.cpp similarity index 100% rename from tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_noop.cpp rename to examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_noop.cpp diff --git 
a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp similarity index 100% rename from tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp rename to examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp diff --git a/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/test_scalar_data.py b/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/test_scalar_data.py new file mode 100644 index 000000000..590bda5cb --- /dev/null +++ b/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/test_scalar_data.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Scalar data dependency test: GetTensorData, SetTensorData, add_inout. + +Tests orchestration-level data manipulation: scalar initialization, +Get/Set round-trips, WAW+WAR dependency auto-wait, and external tensor WAR. 
+""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + +@scene_test(level=2, runtime="tensormap_and_ringbuffer") +class TestScalarData(SceneTestCase): + """Scalar data dependency: Get/SetTensorData, add_inout with initial value.""" + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/scalar_data_orch.cpp", + "function_name": "aicpu_orchestration_entry", + "signature": [D.IN, D.IN, D.OUT, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/kernel_noop.cpp", + "core_type": "aiv", + "signature": [], + }, + ], + } + + CASES = [ + { + "name": "default", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 4, "block_dim": 3}, + "params": {}, + }, + ] + + def generate_args(self, params): + SIZE = 128 * 128 + return TaskArgsBuilder( + Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)), + Tensor("b", torch.arange(SIZE, dtype=torch.float32)), + Tensor("result", torch.zeros(SIZE, dtype=torch.float32)), + Tensor("check", torch.zeros(10, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + # result = a + b (computed by kernel_add) + args.result[:] = args.a + args.b + + # check values written by orchestration via SetTensorData + args.check[0] = 2.0 # GetTensorData(c, {0}): c = a + b, c[0] = 2.0+0.0 + args.check[1] = 102.0 # GetTensorData(c, {100}): c[100] = 2.0+100.0 + args.check[2] = 77.0 # runtime-created scalar output initialized to 77.0 + args.check[3] = 77.0 # second noop via add_inout preserves the value + args.check[4] = 79.0 # orchestration arithmetic: 2.0 + 77.0 + args.check[5] = 42.0 # Orch set->get round-trip: SetTensorData then GetTensorData + args.check[6] = 12.0 # Orch->AICore RAW: SetTensorData(d,10.0) + kernel_add(d,a) -> 10.0+2.0 + args.check[7] = 88.0 # WAW+WAR: 
kernel reads c, SetTensorData(c,88.0) auto-waits + args.check[8] = 55.0 # External WAR: noop(ext_b INOUT) -> SetTensorData(ext_b,55.0) auto-waits + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/pyproject.toml b/pyproject.toml index d25a55c9d..ad5d343ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,9 @@ name = "simpler" version = "0.1.0" requires-python = ">=3.9" +[project.optional-dependencies] +test = ["pytest>=6.0"] + [tool.ruff] line-length = 120 target-version = "py39" @@ -65,7 +68,8 @@ reportRedeclaration = false [tool.pytest.ini_options] testpaths = ["tests", "examples"] -pythonpath = ["python"] +pythonpath = ["python", "golden"] +addopts = "--import-mode=importlib" [tool.scikit-build] wheel.packages = ["simpler_setup"] diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/README.md b/tests/st/a2a3/aicpu_build_graph/paged_attention/README.md deleted file mode 100644 index b56d9774d..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/README.md +++ /dev/null @@ -1,113 +0,0 @@ -# Paged Attention (Device Test - aicpu_build_graph) - -This test demonstrates Paged Attention using the **aicpu_build_graph** runtime, where the AICPU device builds the task graph at runtime via a dlopen'd orchestration plugin while scheduler threads execute tasks concurrently. - -The kernel implementations are identical to the `host_build_graph` version. Only the orchestration and runtime configuration differ. - -## Overview - -Paged Attention is an efficient attention mechanism that processes KV cache in fixed-size blocks, enabling memory-efficient inference for long sequences. 
This implementation uses: - -- **CCE-style codegen** for AIC kernels (Cube unit matmul) -- **PTO Tile API** for AIV kernels (Vector unit operations) -- **Online Softmax** algorithm for numerically stable incremental computation - -### Runtime Architecture - -In `aicpu_build_graph` mode: -- **1 AICPU builder thread** runs the orchestration plugin (builds the task graph) -- **3 AICPU scheduler threads** execute tasks concurrently with graph construction -- The orchestration plugin is compiled as a `.so`, embedded in Runtime, and dlopen'd on AICPU -- The framework pre-allocates device memory for I/O tensors and populates `orch_args[]` - -### Supported Platforms - -| Platform | Description | -|----------|-------------| -| a2a3 | Ascend hardware (requires device ID) | - -> This test uses bfloat16 data types and production-scale shapes that are not supported by the a2a3sim simulator. It only runs on real hardware. - -### Algorithm - -For each query token, the attention is computed incrementally across KV cache blocks: - -``` -For each block j: - sij = Qi @ Kj^T # QK MatMul (AIC) - mij, lij, pij = softmax_prepare(sij) # Softmax (AIV) - oi_new = pij @ Vj # PV MatMul (AIC) - oi = online_update(oi, oi_new, mij, lij) # Accumulate (AIV) -``` - -### Task Graph Structure - -For each batch, the task dependency pattern is: - -``` -Block 0: QK -> SF -> PV --+ -Block 1: QK -> SF -> PV --+--> UP[0] -> UP[1] -> ... 
-> UP[n] -Block n: QK -> SF -> PV --+ -``` - -- **QK/SF/PV chains**: Run in parallel across blocks -- **UP (Online Update)**: Serialized within batch due to accumulator dependency - -## Quick Start - -```bash -# Run on hardware (specify device ID) -python examples/scripts/run_example.py \ - -k tests/st/aicpu_build_graph/paged_attention/kernels \ - -g tests/st/aicpu_build_graph/paged_attention/golden.py \ - -p a2a3 -d 0 - -# Run multi-block test case -PA_CASE=Case2 python examples/scripts/run_example.py \ - -k tests/st/aicpu_build_graph/paged_attention/kernels \ - -g tests/st/aicpu_build_graph/paged_attention/golden.py \ - -p a2a3 -d 0 -``` - -## Directory Structure - -``` -paged_attention/ -├── README.md # This file -├── golden.py # Input generation and expected output -└── kernels/ - ├── kernel_config.py # Kernel registration config (aicpu_build_graph) - ├── aic/ # AIC kernels (CCE codegen style) - │ ├── aic_qk_matmul.cpp # Q @ K^T matmul - │ └── aic_pv_matmul.cpp # P @ V matmul - ├── aiv/ # AIV kernels (PTO Tile API) - │ ├── aiv_softmax_prepare.cpp # Softmax preparation - │ └── aiv_online_update.cpp # Online Softmax update + normalize - └── orchestration/ - └── paged_attention_orch.cpp # AICPU task graph builder (dlopen'd plugin) -``` - -## Test Cases - -| Case | batch | num_heads | kv_head_num | head_dim | block_size | context_len | Description | -|------|-------|-----------|-------------|----------|------------|-------------|-------------| -| Case1 | 256 | 16 | 1 | 128 | 128 | 8192 | Default | -| Case2 | 64 | 64 | 1 | 128 | 64 | 8192 | Multi-block | - -All test cases use **bfloat16** Q/K/V inputs with GQA (kv_head_num=1). 
- -## Key Differences from host_build_graph Version - -| Aspect | host_build_graph | aicpu_build_graph | -|--------|------------------|-------------------| -| Graph building | Host CPU | AICPU device (dlopen'd plugin) | -| I/O memory | Orchestration allocates + copies | Framework pre-manages | -| Task API | `runtime->add_task()` | `api.add_task()` | -| Dependency API | `runtime->add_successor()` | `api.add_successor_conditional()` | -| Task visibility | Implicit | Explicit `api.publish_task()` | -| Thread model | 3 scheduler threads | 1 builder + 3 scheduler threads | - -## See Also - -- [host_build_graph version](../../host_build_graph/paged_attention/README.md) -- [Test Framework Documentation](../../../../examples/scripts/README.md) diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/golden.py b/tests/st/a2a3/aicpu_build_graph/paged_attention/golden.py deleted file mode 100644 index cb27d10f6..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/golden.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- -"""Paged Attention Golden - aicpu_build_graph test (production scale, bfloat16).""" - -from paged_attention_golden import ( - compute_golden, # noqa: F401 - run_golden_test, -) -from paged_attention_golden import generate_inputs as _generate_inputs - -__outputs__ = ["out"] - -RTOL = 1e-3 -ATOL = 1e-3 - -ALL_CASES = { - "Case1": { - "batch": 256, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, - "Case2": { - "batch": 64, - "num_heads": 64, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 64, - "context_len": 8192, - "max_model_len": 32768, - "dtype": "bfloat16", - }, -} - -DEFAULT_CASE = "Case1" - - -def generate_inputs(params: dict) -> list: - return _generate_inputs(params) - - -if __name__ == "__main__": - run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/kernel_config.py b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/kernel_config.py deleted file mode 100644 index 58021f61c..000000000 --- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/kernel_config.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. 
-# ----------------------------------------------------------------------------------------------------------- -""" -Paged Attention — aicpu_build_graph Runtime - -Kernels and orchestration config for paged attention (per-block version). -Uses explicit add_dependency for task ordering, scope-end batch publish. - -AIC Kernels (Cube): - - aic_qk_matmul: Q @ K^T computation - - aic_pv_matmul: P @ V computation - - aic_hub: placeholder hub task - -AIV Kernels (Vector): - - aiv_softmax_prepare: scale, rowmax, exp, rowsum - - aiv_online_update: online softmax accumulation + fused normalization - - aiv_hub: zero-initialize accumulators -""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -_KERNELS_ROOT = Path(__file__).parent - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"), - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], -} - -KERNELS = [ - { - "func_id": 0, - "name": "QK", - "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "PV", - "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 4, - "name": "AIC_HUB", - "source": str(_KERNELS_ROOT / "aic" / "aic_hub.cpp"), - "core_type": "aic", - "signature": [], - }, - { - "func_id": 1, - "name": "SF", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "name": "UP", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, - { - "func_id": 5, - "name": "AIV_HUB", - "source": str(_KERNELS_ROOT / "aiv" / "aiv_hub.cpp"), - "core_type": "aiv", - 
"signature": [], - }, -] - -RUNTIME_CONFIG = { - "runtime": "aicpu_build_graph", - "aicpu_thread_num": 4, - "block_dim": 24, -} diff --git a/tests/st/a2a3/host_build_graph/paged_attention/README.md b/tests/st/a2a3/host_build_graph/paged_attention/README.md deleted file mode 100644 index bb280c331..000000000 --- a/tests/st/a2a3/host_build_graph/paged_attention/README.md +++ /dev/null @@ -1,192 +0,0 @@ -# Paged Attention (Device Test) - -This example demonstrates Paged Attention implementation using CCE (Cube Core Engine) code generation, with AIC matmul kernels and AIV vector kernels using PTO Tile API. - -## Overview - -Paged Attention is an efficient attention mechanism that processes KV cache in fixed-size blocks, enabling memory-efficient inference for long sequences. This implementation uses: - -- **CCE-style codegen** for AIC kernels (Cube unit matmul) -- **PTO Tile API** for AIV kernels (Vector unit operations) -- **Online Softmax** algorithm for numerically stable incremental computation - -### Supported Platforms - -| Platform | Description | -|----------|-------------| -| a2a3 | Ascend hardware (requires device ID) | - -> This test uses bfloat16 data types and production-scale shapes that are not supported by the a2a3sim simulator. It only runs on real hardware. 
- -### Algorithm - -For each query token, the attention is computed incrementally across KV cache blocks: - -``` -For each block j: - sij = Qi @ Kj^T # QK MatMul (AIC) - mij, lij, pij = softmax_prepare(sij) # Softmax (AIV) - oi_new = pij @ Vj # PV MatMul (AIC) - oi = online_update(oi, oi_new, mij, lij) # Accumulate (AIV) -``` - -### Kernel Design (AIC/AIV Split) - -| Kernel | Core Type | Operation | Key Instructions | -|--------|-----------|-----------|------------------| -| aic_qk_matmul | AIC (Cube) | Q @ K^T | TLOAD/TMOV/TMATMUL/TSTORE | -| aiv_softmax_prepare | AIV (Vector) | scale, rowmax, exp, rowsum | TMULS/TROWMAX/TROWEXPANDSUB/TEXP/TROWSUM | -| aic_pv_matmul | AIC (Cube) | P @ V | TLOAD/TMOV/TMATMUL/TSTORE | -| aiv_online_update | AIV (Vector) | Online Softmax + normalize | TMAX/TSUB/TEXP/TROWEXPANDMUL/TROWEXPANDDIV | - -### Memory Hierarchy (AIC Matmul) - -``` -GM -> L1 (Mat tiles) -> L0A/L0B -> L0C (Accumulator) -> GM -``` - -### Task Graph Structure - -For each batch, the task dependency pattern is: - -``` -Block 0: QK -> SF -> PV --+ -Block 1: QK -> SF -> PV --+--> UP[0] -> UP[1] -> ... 
-> UP[n] -Block n: QK -> SF -> PV --+ -``` - -- **QK/SF/PV chains**: Run in parallel across blocks -- **UP (Online Update)**: Serialized within batch due to accumulator dependency - -## Quick Start - -```bash -# Run on hardware (specify device ID) -python examples/scripts/run_example.py \ - -k tests/st/host_build_graph/paged_attention/kernels \ - -g tests/st/host_build_graph/paged_attention/golden.py \ - -p a2a3 -d 0 - -# Run multi-block test case -PA_CASE=Case2 python examples/scripts/run_example.py \ - -k tests/st/host_build_graph/paged_attention/kernels \ - -g tests/st/host_build_graph/paged_attention/golden.py \ - -p a2a3 -d 0 -``` - -## Directory Structure - -``` -paged_attention/ -├── README.md # This file -├── golden.py # Input generation and expected output -└── kernels/ - ├── kernel_config.py # Kernel registration config - ├── aic/ # AIC kernels (CCE codegen style) - │ ├── aic_qk_matmul.cpp # Q @ K^T matmul - │ └── aic_pv_matmul.cpp # P @ V matmul - ├── aiv/ # AIV kernels (PTO Tile API) - │ ├── aiv_softmax_prepare.cpp # Softmax preparation - │ └── aiv_online_update.cpp # Online Softmax update + normalize - └── orchestration/ - └── paged_attention_orch.cpp # Task graph builder -``` - -## Test Cases - -| Case | batch | num_heads | kv_head_num | head_dim | block_size | context_len | Description | -|------|-------|-----------|-------------|----------|------------|-------------|-------------| -| Case1 | 1 | 16 | 1 | 128 | 128 | 256 | Small scale (default) | -| Case2 | 8 | 64 | 1 | 128 | 64 | 8192 | Production scale | - -All test cases use **bfloat16** Q/K/V inputs with GQA (kv_head_num=1). 
- -## Key Technical Details - -### AIC Kernels (CCE Codegen) - -```cpp -// L1 tiles: ColMajor + SLayout::RowMajor (required for matmul) -using TileMatA = Tile; -using TileMatB = Tile; - -// L0 tiles: Use standard TileLeft/TileRight/TileAcc aliases -using LeftTile = TileLeft; -using RightTile = TileRight; -using AccTile = TileAcc; - -// Pipeline: MTE2 -> MTE1 -> M -> FIX -> MTE3 -TLOAD(aMatTile, qiGlobal); // GM -> L1 -TMOV(aTile, aMatTile); // L1 -> L0A -TMATMUL(cTile, aTile, bTile); // L0A x L0B -> L0C -TSTORE(sijGlobal, cTile); // L0C -> GM -``` - -### AIV Kernels (PTO Tile API) - -**softmax_prepare**: Uses DN layout (ColMajor, 16x1) for row reduction results - -```cpp -using TileScalarDN = Tile; - -TMULS(sijTile, sijTile, scale_value); // Scale -TROWMAX(maxTile, sijTile, tmpTile); // Row max -TROWEXPANDSUB(pijTile, sijTile, maxTile); // Subtract max (broadcast) -TEXP(pijTile, pijTile); // Exp -TROWSUM(sumTile, pijTile, tmpTile); // Row sum -``` - -**online_update**: Uses ND/DN layout conversion for hardware compatibility - -```cpp -// ND (1x16, RowMajor) for scalar arithmetic - TSUB/TMUL/TADD require RowMajor -using TileScalarND = Tile; -// DN (16x1, ColMajor) for row broadcast - TROWEXPANDMUL/TROWEXPANDDIV require this -using TileScalarDN = Tile; - -// Arithmetic in ND layout -TMAX(miNewTileND, miTileND, mijTileND); -TSUB(alphaTileND, miTileND, miNewTileND); -TEXP(alphaTileND, alphaTileND); - -// Reshape ND -> DN for broadcast operations -TRESHAPE(alphaTileDN, alphaTileND); -TROWEXPANDMUL(oiTile, oiTile, alphaTileDN); -``` - -### Data Layout - -- **K stored as K^T**: (head_dim, block_size) for direct matmul compatibility -- **V stored normally**: (block_size, head_dim) - -## Expected Output - -``` -=== Compiling and Registering Kernels === -Compiling kernel: .../aic_qk_matmul.cpp (func_id=0) -Compiling kernel: .../aiv_softmax_prepare.cpp (func_id=1) -Compiling kernel: .../aic_pv_matmul.cpp (func_id=2) -Compiling kernel: .../aiv_online_update.cpp (func_id=3) 
-... -=== build_paged_attention_graph (16x16 framework version) === -batch=1, num_heads=16, kv_head_num=1, head_dim=16 -block_size=16, block_num=1 -... -Created 4 tasks -... -=== Comparing Results === -Comparing out: shape=(256,), dtype=float32 - out: PASS (256/256 elements matched) - -============================================================ -TEST PASSED -============================================================ -``` - -## Reference - -This implementation uses the Online Softmax algorithm for paged attention, with identical kernel structure to the PyPTO reference implementation. - -## See Also - -- [Test Framework Documentation](../../../../examples/scripts/README.md) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/golden.py b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/golden.py deleted file mode 100644 index aba775d82..000000000 --- a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/golden.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -Paged Attention Ring Buffer Stress Test Golden - -Tests paged attention with small ring buffer sizes (TW=1024, HP=1MB, DP=1024) -to guard the ring buffer rotation/reclamation logic. 
-""" - -from paged_attention_golden import ( - generate_inputs as _generate_inputs, - compute_golden, - run_golden_test, -) - -__outputs__ = ["out"] - -RTOL = 1e-3 -ATOL = 1e-3 - -ALL_CASES = { - "Case1": { - "batch": 32, - "num_heads": 16, - "kv_head_num": 1, - "head_dim": 128, - "block_size": 128, - "context_len": 4096, - "max_model_len": 32768, - "dtype": "bfloat16", - }, -} - -DEFAULT_CASE = "Case1" - - -def generate_inputs(params: dict) -> list: - return _generate_inputs(params) - - -if __name__ == "__main__": - run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/kernels/kernel_config.py b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/kernels/kernel_config.py deleted file mode 100644 index 100a75305..000000000 --- a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/kernels/kernel_config.py +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Paged Attention Ring Buffer Stress Test - -Reuses paged_attention kernels and orchestration with deliberately small -ring buffer sizes to exercise and guard the ring buffer rotation logic. - -The orchestration uses an inner PTO2_SCOPE per block, allowing per-block -ring resources to be reclaimed. 
Combined with small ring sizes, this -stresses the back-pressure and reclamation paths. - -Environment overrides: - PTO2_RING_TASK_WINDOW = 128 (vs default 65536, 8x smaller than prev 1024) - PTO2_RING_HEAP = 256KB (vs default 1GB, 4x smaller than prev 1MB) - PTO2_RING_DEP_POOL = 256 (vs default 65536, 4x smaller than prev 1024) -""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -# Point to paged_attention's kernel sources (no duplication) -_PA_KERNELS = Path(__file__).parent / ".." / ".." / "paged_attention" / "kernels" - -ORCHESTRATION = { - "source": str(_PA_KERNELS / "orchestration" / "paged_attention_orch.cpp"), - "function_name": "build_paged_attention_graph", - "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT], -} - -KERNELS = [ - { - "func_id": 0, - "name": "QK", - "source": str(_PA_KERNELS / "aic" / "aic_qk_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 2, - "name": "PV", - "source": str(_PA_KERNELS / "aic" / "aic_pv_matmul.cpp"), - "core_type": "aic", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "name": "SF", - "source": str(_PA_KERNELS / "aiv" / "aiv_softmax_prepare.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.OUT, D.OUT, D.OUT], - }, - { - "func_id": 3, - "name": "UP", - "source": str(_PA_KERNELS / "aiv" / "aiv_online_update.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT], - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 24, -} - -# Small ring buffer sizes — see module docstring for rationale. 
-RUNTIME_ENV = { - "PTO2_RING_TASK_WINDOW": "128", - "PTO2_RING_HEAP": "262144", - "PTO2_RING_DEP_POOL": "256", -} diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/golden.py b/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/golden.py deleted file mode 100644 index 2b010b018..000000000 --- a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/golden.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- - -""" -Golden script for scalar data dependency test. - -Tests GetTensorData, SetTensorData, and add_inout with initial value. 
- -Computation: - c = a + b (kernel, internal tensor) - check[0] = GetTensorData(c, {0}) = a[0]+b[0] = 2.0+0.0 = 2.0 - check[1] = GetTensorData(c, {100}) = a[100]+b[100] = 2.0+100.0 = 102.0 - scalar initialized to 77.0 via add_output(TensorCreateInfo, to_u64(77.0f)) - check[2] = GetTensorData(scalar, {0}) = 77.0 - second noop with add_inout(scalar), value preserved - check[3] = GetTensorData(scalar, {0}) = 77.0 - check[4] = orchestration arithmetic: 2.0 + 77.0 = 79.0 - SetTensorData(scalar, {0}, 42.0), then GetTensorData round-trip - check[5] = GetTensorData(scalar, {0}) = 42.0 - Orch SetTensorData(d, {0}, 10.0) → kernel_add(d, a) → e[0] = 12.0 - check[6] = GetTensorData(e, {0}) = 12.0 - WAW+WAR: kernel reads c as INPUT, then SetTensorData(c, 88.0) auto-waits - check[7] = GetTensorData(c, {0}) = 88.0 - External WAR: noop(ext_b as INOUT) → SetTensorData(ext_b, 55.0) auto-waits - check[8] = GetTensorData(ext_b, {0}) = 55.0 (ext_b[0] restored to 0.0 after) - result = a + b (kernel, external output) - -Args layout: [a, b, result, check] -""" - -import torch - -__outputs__ = ["result", "check"] - -RTOL = 1e-5 -ATOL = 1e-5 - - -def generate_inputs(params: dict) -> list: - SIZE = 128 * 128 # 16384 -- matches kernel_add 128x128 tile - - a = torch.full((SIZE,), 2.0, dtype=torch.float32) - b = torch.arange(SIZE, dtype=torch.float32) - result = torch.zeros(SIZE, dtype=torch.float32) - check = torch.zeros(10, dtype=torch.float32) - - return [ - ("a", a), - ("b", b), - ("result", result), - ("check", check), - ] - - -def compute_golden(tensors: dict, params: dict) -> None: - a = torch.as_tensor(tensors["a"]) - b = torch.as_tensor(tensors["b"]) - - # result = a + b (computed by kernel_add) - tensors["result"][:] = a + b - - # check values written by orchestration via SetTensorData - check = torch.as_tensor(tensors["check"]) - check[0] = 2.0 # GetTensorData(c, {0}): c = a + b, c[0] = 2.0+0.0 - check[1] = 102.0 # GetTensorData(c, {100}): c[100] = 2.0+100.0 - check[2] = 77.0 # 
runtime-created scalar output initialized to 77.0 - check[3] = 77.0 # second noop via add_inout preserves the value - check[4] = 79.0 # orchestration arithmetic: 2.0 + 77.0 - check[5] = 42.0 # Orch set→get round-trip: SetTensorData then GetTensorData - check[6] = 12.0 # Orch→AICore RAW: SetTensorData(d,10.0) + kernel_add(d,a) → 10.0+2.0 - check[7] = 88.0 # WAW+WAR: kernel reads c, SetTensorData(c,88.0) auto-waits for consumer - check[8] = 55.0 # External WAR: noop(ext_b INOUT) → SetTensorData(ext_b,55.0) auto-waits - # check[9] remains 0.0 (sentinel) diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/kernel_config.py b/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/kernel_config.py deleted file mode 100644 index f7c67b088..000000000 --- a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/kernel_config.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) PyPTO Contributors. -# This program is free software, you can redistribute it and/or modify it under the terms and conditions of -# CANN Open Software License Agreement Version 2.0 (the "License"). -# Please refer to the License for details. You may not use this file except in compliance with the License. -# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -# See LICENSE in the root of the software repository for the full text of the License. -# ----------------------------------------------------------------------------------------------------------- -""" -Kernel configuration for scalar data dependency test (tensormap_and_ringbuffer). - -Tests GetTensorData, SetTensorData, and add_inout with initial value. 
- -Kernels: - func_id=0: kernel_add (AIV) - element-wise tensor addition (128x128) - func_id=1: kernel_noop (AIV) - empty kernel for allocation trigger -""" - -from pathlib import Path - -from simpler.task_interface import ArgDirection as D # pyright: ignore[reportAttributeAccessIssue] - -_KERNELS_ROOT = Path(__file__).parent - -ORCHESTRATION = { - "source": str(_KERNELS_ROOT / "orchestration" / "scalar_data_orch.cpp"), - "function_name": "aicpu_orchestration_entry", - "signature": [D.IN, D.IN, D.OUT, D.OUT], -} - -KERNELS = [ - { - "func_id": 0, - "source": str(_KERNELS_ROOT / "aiv" / "kernel_add.cpp"), - "core_type": "aiv", - "signature": [D.IN, D.IN, D.OUT], - }, - { - "func_id": 1, - "source": str(_KERNELS_ROOT / "aiv" / "kernel_noop.cpp"), - "core_type": "aiv", - "signature": [], - }, -] - -RUNTIME_CONFIG = { - "runtime": "tensormap_and_ringbuffer", - "aicpu_thread_num": 4, - "block_dim": 3, -} diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh index 64b283e81..bd674b866 100755 --- a/tools/benchmark_rounds.sh +++ b/tools/benchmark_rounds.sh @@ -1,4 +1,12 @@ #!/usr/bin/env bash +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- # Benchmark wrapper: run examples on hardware, # then parse device-log timing lines to report per-round latency. 
# @@ -122,9 +130,14 @@ vlog() { } # --------------------------------------------------------------------------- -# Derive arch from platform and set examples directory +# Derive arch from platform and set examples directories # --------------------------------------------------------------------------- -EXAMPLES_DIR="$PROJECT_ROOT/tests/st/${PLATFORM}/${RUNTIME}" +# Search both examples/ (migrated tests) and tests/st/ (legacy tests) +ARCH="${PLATFORM%%sim}" # strip "sim" suffix if present +EXAMPLES_DIRS=( + "$PROJECT_ROOT/examples/${ARCH}/${RUNTIME}" + "$PROJECT_ROOT/tests/st/${ARCH}/${RUNTIME}" +) # Clock frequency (MHz) for converting cycle counts to microseconds case "$PLATFORM" in @@ -458,12 +471,19 @@ SUMMARY_ORCH=() echo "" echo "Runtime: $RUNTIME" -echo "Tests dir: $EXAMPLES_DIR" for example in "${EXAMPLE_ORDER[@]}"; do case_list="${EXAMPLE_CASES[$example]:-}" - EXAMPLE_DIR="$EXAMPLES_DIR/$example" + # Search for example in both directories + EXAMPLE_DIR="" + for dir in "${EXAMPLES_DIRS[@]}"; do + if [[ -f "$dir/$example/golden.py" && -d "$dir/$example/kernels" ]]; then + EXAMPLE_DIR="$dir/$example" + break + fi + done + KERNELS_DIR="$EXAMPLE_DIR/kernels" GOLDEN="$EXAMPLE_DIR/golden.py" @@ -472,8 +492,8 @@ for example in "${EXAMPLE_ORDER[@]}"; do echo " $example" echo "================================================================" - if [[ ! -f "$GOLDEN" || ! -d "$KERNELS_DIR" ]]; then - echo " SKIP: missing kernels/ or golden.py" + if [[ -z "$EXAMPLE_DIR" ]]; then + echo " SKIP: not found in any search directory" ((FAIL++)) || true continue fi