diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 22e6cbbba..5b43bdc02 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -174,7 +174,7 @@ jobs:
         run: python ci.py -p a2a3sim -c d96c8784 -t 600 --clone-protocol https
 
       - name: Run pytest scene tests (a2a3sim)
-        run: pytest examples tests/st --platform a2a3sim -v
+        run: pytest examples tests/st --platform a2a3sim --device 0-15 -v
 
   st-sim-a5:
     runs-on: ${{ matrix.os }}
@@ -228,7 +228,7 @@ jobs:
         run: python ci.py -p a5sim -c d96c8784 -t 600 --clone-protocol https
 
       - name: Run pytest scene tests (a5sim)
-        run: pytest examples tests/st --platform a5sim -v
+        run: pytest examples tests/st --platform a5sim --device 0-15 -v
 
   # ---------- Python unit tests (a2a3 hardware) ----------
   ut-py-a2a3:
diff --git a/conftest.py b/conftest.py
index 5e33c740f..a36945dea 100644
--- a/conftest.py
+++ b/conftest.py
@@ -6,10 +6,19 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # -----------------------------------------------------------------------------------------------------------
-"""Root conftest — CLI options, markers, ST platform filtering, and ST fixtures."""
+"""Root conftest — CLI options, markers, ST platform filtering, runtime isolation, and ST fixtures.
+
+Runtime isolation: CANN's AICPU framework caches the user .so per device context.
+Switching runtimes on the same device within one process causes hangs. When multiple
+runtimes are collected and --runtime is not specified, pytest_runtestloop spawns a
+subprocess per runtime so each gets a clean CANN context. See docs/testing.md.
+"""
 
 from __future__ import annotations
 
+import subprocess
+import sys
+
 import pytest
 
 
@@ -22,22 +31,16 @@ def _parse_device_range(s: str) -> list[int]:
 
 
 class DevicePool:
-    """Simple device allocator for pytest fixtures.
+    """Device allocator for pytest fixtures.
 
-    On sim platforms, device IDs are virtual — allocate always succeeds.
-    On real hardware, IDs are exclusive.
+    Manages a fixed set of device IDs. Tests allocate IDs before use
+    and release them after. Works identically for sim and onboard.
     """
 
-    def __init__(self, device_ids: list[int], *, is_sim: bool = False):
+    def __init__(self, device_ids: list[int]):
         self._available = list(device_ids)
-        self._is_sim = is_sim
-        self._sim_next = 0
 
     def allocate(self, n: int = 1) -> list[int]:
-        if self._is_sim:
-            ids = list(range(self._sim_next, self._sim_next + n))
-            self._sim_next += n
-            return ids
         if n > len(self._available):
             return []
         allocated = self._available[:n]
@@ -45,8 +48,7 @@ def allocate(self, n: int = 1) -> list[int]:
         return allocated
 
     def release(self, ids: list[int]) -> None:
-        if not self._is_sim:
-            self._available.extend(ids)
+        self._available.extend(ids)
 
 
 _device_pool: DevicePool | None = None
@@ -58,6 +60,7 @@ def pytest_addoption(parser):
     parser.addoption("--device", action="store", default="0", help="Device ID or range (e.g., 0, 4-7)")
     parser.addoption("--case", action="store", default=None, help="Run specific case name only")
     parser.addoption("--all-cases", action="store_true", default=False, help="Include manual cases")
+    parser.addoption("--runtime", action="store", default=None, help="Only run tests for this runtime")
 
 
 def pytest_configure(config):
@@ -68,18 +71,31 @@ def pytest_configure(config):
 
 
 def pytest_collection_modifyitems(session, config, items):
-    """Skip ST tests based on --platform filter."""
+    """Skip ST tests based on --platform and --runtime filters, and order L3 before L2."""
     platform = config.getoption("--platform")
+    runtime_filter = config.getoption("--runtime")
+
+    # Sort: L3 tests first (they fork child processes that inherit main process CANN state,
+    # so they must run before L2 tests pollute the CANN context).
+    def sort_key(item):
+        cls = getattr(item, "cls", None)
+        level = getattr(cls, "_st_level", 0) if cls else 0
+        return (0 if level >= 3 else 1, item.nodeid)
+
+    items.sort(key=sort_key)
+
     for item in items:
-        # SceneTestCase subclass: skip if no case matches current platform
         cls = getattr(item, "cls", None)
         if cls and hasattr(cls, "CASES") and isinstance(cls.CASES, list):
             if not platform:
                 item.add_marker(pytest.mark.skip(reason="--platform required"))
             elif not any(platform in c.get("platforms", []) for c in cls.CASES):
                 item.add_marker(pytest.mark.skip(reason=f"No cases for {platform}"))
+            elif runtime_filter and getattr(cls, "_st_runtime", None) != runtime_filter:
+                item.add_marker(
+                    pytest.mark.skip(reason=f"Runtime {getattr(cls, '_st_runtime', '?')} != {runtime_filter}")
+                )
             continue
-        # Standalone function with @pytest.mark.platforms([...])
         platforms_marker = item.get_closest_marker("platforms")
         if platforms_marker:
             if not platform:
@@ -88,15 +104,77 @@ def pytest_collection_modifyitems(session, config, items):
                 item.add_marker(pytest.mark.skip(reason=f"Not supported on {platform}"))
 
 
+# ---------------------------------------------------------------------------
+# Runtime isolation: spawn subprocess per runtime
+# ---------------------------------------------------------------------------
+
+
+def _collect_st_runtimes(items):
+    """Return sorted list of unique runtimes from collected SceneTestCase items."""
+    runtimes = set()
+    for item in items:
+        cls = getattr(item, "cls", None)
+        rt = getattr(cls, "_st_runtime", None) if cls else None
+        if rt:
+            runtimes.add(rt)
+    return sorted(runtimes)
+
+
+def pytest_runtestloop(session):
+    """Override test execution to isolate runtimes in subprocesses.
+
+    If --runtime is specified (or at most one runtime is collected), run normally.
+    Otherwise, run one pytest subprocess per runtime; any non-zero exit fails the session.
+    """
+    runtime_filter = session.config.getoption("--runtime")
+    if runtime_filter:
+        return  # single runtime — let pytest run normally
+
+    runtimes = _collect_st_runtimes(session.items)
+    if len(runtimes) <= 1:
+        return  # zero or one runtime — no isolation needed
+
+    # Multiple runtimes: spawn subprocess per runtime
+    # Re-invoke pytest with the same args + --runtime <rt> for each runtime
+    base_args = [sys.executable, "-m", "pytest"]
+    for arg in session.config.invocation_params.args:
+        base_args.append(str(arg))
+
+    failed = False
+    for rt in runtimes:
+        # Build subprocess command: inject --runtime <rt>
+        cmd = base_args + ["--runtime", rt]
+        header = f"  Runtime: {rt}"
+        print(f"\n{'=' * 60}\n{header}\n{'=' * 60}\n", flush=True)
+
+        result = subprocess.run(cmd, check=False, cwd=session.config.invocation_params.dir)
+        if result.returncode != 0:
+            failed = True
+            print(f"\n*** Runtime {rt}: FAILED ***\n", flush=True)
+        else:
+            print(f"\n--- Runtime {rt}: PASSED ---\n", flush=True)
+
+    if failed:
+        session.testsfailed = 1
+    else:
+        session.testscollected = sum(1 for _ in session.items)
+        session.testsfailed = 0
+
+    return True  # returning True prevents default runtestloop
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
 @pytest.fixture(scope="session")
 def device_pool(request):
     """Session-scoped device pool parsed from --device."""
     global _device_pool  # noqa: PLW0603
     if _device_pool is None:
         raw = request.config.getoption("--device")
-        platform = request.config.getoption("--platform")
-        is_sim = platform is None or platform.endswith("sim")
-        _device_pool = DevicePool(_parse_device_range(raw), is_sim=is_sim)
+        _device_pool = DevicePool(_parse_device_range(raw))
     return _device_pool
 
 
diff --git a/docs/distributed_level_runtime.md b/docs/distributed_level_runtime.md
index 8a73dca7c..1f1204639 100644
--- a/docs/distributed_level_runtime.md
+++ b/docs/distributed_level_runtime.md
@@ -339,6 +339,12 @@ Python Application
                  └── forked child process    ← mailbox state machine
 ```
 
+## Runtime Isolation (Onboard Hardware)
+
+A single device can only run **one runtime** per CANN process context. CANN's AICPU framework (`libaicpu_extend_kernels.so`) caches the user AICPU .so on first load and skips reloading on subsequent launches. If a different runtime's AICPU .so is launched on the same device, the cached (stale) function pointers are used, causing hangs.
+
+This means: **do not reuse a device across different runtimes within a single process.** Either use separate processes (one per runtime), or partition devices so each runtime gets exclusive devices. See [testing.md](testing.md#runtime-isolation-constraint-onboard) for details and the pytest device allocation algorithm.
+
 ## Files
 
 | File | Purpose |
diff --git a/docs/testing.md b/docs/testing.md
index 69612672e..7f38f8298 100644
--- a/docs/testing.md
+++ b/docs/testing.md
@@ -331,6 +331,71 @@ The test will be automatically picked up by `ci.py`. New tests should prefer the
 
 See [ci.md](ci.md) for the full CI pipeline documentation, including the job matrix, runner constraints, marker scheme, and `ci.sh` internals.
 
+## Runtime Isolation Constraint (Onboard)
+
+**One device can only run one runtime per process.** Switching runtimes on the same device within a single process causes AICPU kernel hangs.
+
+### Root Cause
+
+CANN's AICPU dispatch uses a framework SO (`libaicpu_extend_kernels.so`) with a global singleton `BackendServerHandleManager` that:
+
+1. **`SaveSoFile`**: Writes the user AICPU .so to disk on first call, then sets `firstCreatSo_ = true` to skip all subsequent writes.
+2. **`SetTileFwkKernelMap`**: `dlopen`s the .so and caches function pointers on first call, then sets `firstLoadSo_ = true` to skip all subsequent loads.
+
+When a second runtime launches on the same device (same CANN process context), the Init kernel call hits the cached flags — the new AICPU .so is never written or loaded. The Exec kernel then calls function pointers from the first runtime's .so, which operates on incompatible data structures and hangs.
+
+### Impact
+
+| Scenario | Result |
+| -------- | ------ |
+| Same runtime, same device, sequential | Works (same .so, cached pointers valid) |
+| Different runtime, same device, sequential | **Hangs** (stale .so, wrong function pointers) |
+| Different runtime, different device | Works (separate CANN context per device) |
+| Different runtime, different process, same device | Works (`rtDeviceReset` between processes clears context) |
+
+### Mitigation in pytest
+
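+`conftest.py` isolates runtimes by process: when tests for more than one runtime are collected and `--runtime` is not given, `pytest_runtestloop` re-runs pytest in one subprocess per runtime (passing `--runtime <rt>`), so each runtime gets a clean CANN context. To keep runtimes apart inside a single process instead, tests must be grouped by runtime and each runtime group assigned exclusive devices; see "Device Allocation Algorithm" below.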
+
+## Device Allocation Algorithm (Onboard pytest)
+
+When running `pytest --platform a2a3 --device 8-11`, the fixture must allocate devices to tests such that:
+
+1. **Runtime isolation**: A device used by runtime A must not be reused by runtime B in the same process.
+2. **L3 multi-device**: L3 tests may need 2+ contiguous devices.
+3. **Efficiency**: Devices freed by one test of the same runtime can be reused by the next.
+
+### Algorithm
+
+```text
+Phase 1: Group tests by runtime
+  tensormap_and_ringbuffer: [TestVectorExample, TestScalarData, TestL3Dependency, ...]
+  aicpu_build_graph:        [TestPagedAttentionAicpuBuildGraph]
+  host_build_graph:         [TestPagedAttentionHostBuildGraph]
+
+Phase 2: Partition devices across runtime groups
+  Available: [8, 9, 10, 11]
+  tensormap_and_ringbuffer (6 tests, needs max 2 for L3 group): devices [8, 9]
+  aicpu_build_graph (1 test, needs 1):                          devices [10]
+  host_build_graph (1 test, needs 1):                           devices [11]
+
+Phase 3: Within each group, allocate from group's device pool
+  TestVectorExample:       dev 8 → run → release → dev 8 available again
+  TestScalarData:          dev 8 → run → release → OK (same runtime)
+  TestL3Dependency:        dev 8 → run → release
+  TestL3Group:             dev [8, 9] → run → release
+  TestPagedAttentionAicpuBuildGraph: dev 10 → run → release
+  TestPagedAttentionHostBuildGraph:  dev 11 → run → release
+```
+
+### Implementation
+
+The `DevicePool` in `conftest.py` is extended with runtime-aware partitioning. The `st_worker` fixture checks the test class's `_st_runtime` and allocates from the corresponding partition.
+
+### Sim platforms
+
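+On sim (`a2a3sim`, `a5sim`), device IDs are virtual — no hardware state, no isolation constraint. The `DevicePool` nevertheless handles them exactly as it does onboard: tests allocate from the fixed ID set given by `--device` (CI passes `--device 0-15`) and release the IDs afterwards; there is no auto-incrementing virtual pool.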
+
 ## Per-Case Device Filtering
 
 The `@scene_test(platforms=[...])` decorator provides per-case platform filtering. A single test class declares which platforms it supports:
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp
similarity index 100%
rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp
rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_hub.cpp
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
similarity index 100%
rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
similarity index 100%
rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp
similarity index 100%
rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp
rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_hub.cpp
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
similarity index 100%
rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
similarity index 100%
rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp
similarity index 100%
rename from tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp
rename to examples/a2a3/aicpu_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp
diff --git a/examples/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py b/examples/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py
new file mode 100644
index 000000000..62687419b
--- /dev/null
+++ b/examples/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention — aicpu_build_graph runtime (production scale, bfloat16).
+
+Tests aicpu_build_graph runtime with hub kernels (aic_hub, aiv_hub),
+INOUT tensors, and AIC+AIV mixed execution.
+"""
+
+import torch
+from paged_attention_golden import compute_golden as _pa_compute_golden  # noqa: PLC0415
+from paged_attention_golden import generate_inputs as _pa_generate_inputs  # noqa: PLC0415
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+
+@scene_test(level=2, runtime="aicpu_build_graph")
+class TestPagedAttentionAicpuBuildGraph(SceneTestCase):
+    """Paged attention with aicpu_build_graph runtime and hub kernels."""
+
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 4,
+                "source": "kernels/aic/aic_hub.cpp",
+                "core_type": "aic",
+                "signature": [],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+            {
+                "func_id": 5,
+                "source": "kernels/aiv/aiv_hub.cpp",
+                "core_type": "aiv",
+                "signature": [],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "case1",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "case2",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        inputs = _pa_generate_inputs(params)
+        specs = []
+        for name, val in inputs:
+            if isinstance(val, torch.Tensor):
+                specs.append(Tensor(name, val))
+            else:
+                specs.append(Scalar(name, val))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a2a3/host_build_graph/paged_attention/golden.py b/examples/a2a3/host_build_graph/paged_attention/golden.py
deleted file mode 100644
index fc57c9f7d..000000000
--- a/examples/a2a3/host_build_graph/paged_attention/golden.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Paged Attention Golden - host_build_graph example (small scale, float16).
-
-Args layout: [query, key_cache, value_cache, block_table, context_lens, out, scale]
-  - Tensors retain original multi-dimensional shapes (ContinuousTensor metadata carries shape/dtype)
-  - scale is a scalar float parameter
-"""
-
-from paged_attention_golden import (
-    compute_golden,  # noqa: F401
-    run_golden_test,
-)
-from paged_attention_golden import generate_inputs as _generate_inputs
-
-__outputs__ = ["out"]
-
-RTOL = 1e-2
-ATOL = 1e-2
-
-ALL_CASES = {
-    "Case1": {
-        "batch": 1,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 16,
-        "block_size": 16,
-        "context_len": 16,
-        "max_model_len": 256,
-        "dtype": "float16",
-    },
-    "Case2": {
-        "batch": 1,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 16,
-        "block_size": 16,
-        "context_len": 64,
-        "max_model_len": 256,
-        "dtype": "float16",
-    },
-}
-
-DEFAULT_CASE = "Case1"
-
-
-def generate_inputs(params: dict) -> list:
-    return _generate_inputs(params)
-
-
-if __name__ == "__main__":
-    run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs)
diff --git a/examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py b/examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py
deleted file mode 100644
index 0245cc8a5..000000000
--- a/examples/a2a3/host_build_graph/paged_attention/kernels/kernel_config.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Paged Attention Kernel and Orchestration Configuration
-
-Defines the kernels and orchestration function for paged attention
-with AIC/AIV subgraph splitting:
-
-AIC Kernels (Matrix Multiplication):
-  - aic_qk_matmul: Q @ K^T computation
-  - aic_pv_matmul: P @ V computation
-
-AIV Kernels (Vector Operations):
-  - aiv_softmax_prepare: scale, rowmax, exp, rowsum
-  - aiv_online_update: online softmax accumulation + fused normalization
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-_KERNELS_ROOT = Path(__file__).parent
-
-# Orchestration config
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"),
-    "function_name": "build_paged_attention_graph",
-    "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
-}
-
-# Kernel configs
-KERNELS = [
-    # AIC kernels (matrix multiplication using Cube unit)
-    {
-        "func_id": 0,
-        "name": "QK",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 2,
-        "name": "PV",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    # AIV kernels (vector operations)
-    {
-        "func_id": 1,
-        "name": "SF",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.OUT, D.OUT, D.OUT],
-    },
-    {
-        "func_id": 3,
-        "name": "UP",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
-    },
-]
-
-# Runtime configuration
-RUNTIME_CONFIG = {
-    "runtime": "host_build_graph",
-    "aicpu_thread_num": 3,
-    "block_dim": 3,
-}
diff --git a/examples/a2a3/host_build_graph/paged_attention/test_paged_attention.py b/examples/a2a3/host_build_graph/paged_attention/test_paged_attention.py
new file mode 100644
index 000000000..ec67f7307
--- /dev/null
+++ b/examples/a2a3/host_build_graph/paged_attention/test_paged_attention.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention — host_build_graph runtime (small scale, float16).
+
+Tests host_build_graph runtime with AIC+AIV mixed execution and INOUT tensors.
+"""
+
+import torch
+from paged_attention_golden import compute_golden as _pa_compute_golden  # noqa: PLC0415
+from paged_attention_golden import generate_inputs as _pa_generate_inputs  # noqa: PLC0415
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+
+@scene_test(level=2, runtime="host_build_graph")
+class TestPagedAttentionHostBuildGraph(SceneTestCase):
+    """Paged attention with host_build_graph runtime."""
+
+    RTOL = 1e-2
+    ATOL = 1e-2
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "build_paged_attention_graph",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "small1",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 16,
+                "max_model_len": 256,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "small2",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "manual": True,
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 64,
+                "max_model_len": 256,
+                "dtype": "float16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        inputs = _pa_generate_inputs(params)
+        specs = []
+        for name, val in inputs:
+            if isinstance(val, torch.Tensor):
+                specs.append(Tensor(name, val))
+            else:
+                specs.append(Scalar(name, val))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py
similarity index 86%
rename from tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py
rename to examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py
index 2ffe2b19f..13652252d 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/golden.py
@@ -1,3 +1,11 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
 """
 Golden test specification for alternating matmul-add test.
 
@@ -10,9 +18,10 @@
 """
 
 import ctypes
-import torch
 import time
 
+import torch
+
 __outputs__ = ["C", "Z"]
 RTOL = 1e-3
 ATOL = 1e-3
@@ -34,7 +43,6 @@
         "matmul_batch": 4,  # Number of matmul tiles per task
         "add_batch": 5,  # Number of add tiles per task
     },
-
 }
 
 DEFAULT_CASE = "Case1"
@@ -67,14 +75,10 @@ def generate_inputs(params: dict) -> list:
 
     if total_matmul_tasks % matmul_batch != 0:
         raise ValueError(
-            f"total_matmul_tasks ({total_matmul_tasks}) must be "
-            f"divisible by matmul_batch ({matmul_batch})"
+            f"total_matmul_tasks ({total_matmul_tasks}) must be divisible by matmul_batch ({matmul_batch})"
         )
     if total_add_tasks % add_batch != 0:
-        raise ValueError(
-            f"total_add_tasks ({total_add_tasks}) must be "
-            f"divisible by add_batch ({add_batch})"
-        )
+        raise ValueError(f"total_add_tasks ({total_add_tasks}) must be divisible by add_batch ({add_batch})")
 
     # Prevent integer overflow in orchestration (task_idx = b * M + m or b * N + n)
     INT32_MAX = 2**31 - 1
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp
similarity index 100%
rename from tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp
rename to examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aic/kernel_matmul.cpp
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp
similarity index 100%
rename from tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp
rename to examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/aiv/kernel_add.cpp
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py
similarity index 100%
rename from tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py
rename to examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/kernel_config.py
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
similarity index 100%
rename from tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
rename to examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/kernels/orchestration/alternating_orch.cpp
diff --git a/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/test_alternating_matmul_add.py b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/test_alternating_matmul_add.py
new file mode 100644
index 000000000..b25b8ee51
--- /dev/null
+++ b/examples/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/test_alternating_matmul_add.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Alternating matmul + add: interleaved AIC (matmul 128x128) and AIV (add 128x128) tasks.
+
+Tests AIC+AIV mixed execution with scalar parameters and batched task submission.
+C[b,m] = A[b,m] @ B[b,m], Z[b,n] = X[b,n] + Y[b,n].
+"""
+
+import ctypes
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestAlternatingMatmulAdd(SceneTestCase):
+    """Scene test: 128x128 matmul tiles on AIC and 128x128 add tiles on AIV, sized by scalar parameters."""
+
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/alternating_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/kernel_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {"batch": 1, "M": 1, "N": 1, "matmul_batch": 1, "add_batch": 1},
+        },
+    ]
+
+    def generate_args(self, params):
+        """Build the seeded, flattened float32 operands and the int64 scalars handed to the orchestration."""
+        n_batch, n_mm, n_add = params["batch"], params["M"], params["N"]
+        mm_group = params.get("matmul_batch", 1)
+        add_group = params.get("add_batch", 1)
+        mm_shape = (n_batch, n_mm, 128, 128)  # one 128x128 matmul tile per (b, m)
+        add_shape = (n_batch, n_add, 128, 128)  # one 128x128 add tile per (b, n)
+        # Seed once; the randn draws must stay in A, B, X, Y order to reproduce the same data.
+        torch.manual_seed(42)
+        lhs = torch.randn(*mm_shape, dtype=torch.float32) * 0.01
+        rhs = torch.randn(*mm_shape, dtype=torch.float32) * 0.01
+        x_in = torch.randn(*add_shape, dtype=torch.float32) * 0.01
+        y_in = torch.randn(*add_shape, dtype=torch.float32) * 0.01
+        prod = torch.zeros(*mm_shape, dtype=torch.float32)
+        total = torch.zeros(*add_shape, dtype=torch.float32)
+
+        tensor_specs = [
+            Tensor(key, data.flatten())
+            for key, data in (("A", lhs), ("B", rhs), ("C", prod), ("X", x_in), ("Y", y_in), ("Z", total))
+        ]
+        scalar_specs = [
+            Scalar(key, ctypes.c_int64(num))
+            for key, num in (
+                ("batch", n_batch),
+                ("M_val", n_mm),
+                ("N_val", n_add),
+                ("matmul_batch", mm_group),
+                ("add_batch", add_group),
+            )
+        ]
+        return TaskArgsBuilder(*tensor_specs, *scalar_specs)
+
+    def compute_golden(self, args, params):
+        """Fill ``args.C`` and ``args.Z`` in place with the reference results.
+
+        C[b, m] = A[b, m] @ B[b, m] and Z[b, n] = X[b, n] + Y[b, n], computed one tile at a time.
+        """
+        n_batch, n_mm, n_add = params["batch"], params["M"], params["N"]
+        mm_shape = (n_batch, n_mm, 128, 128)
+        add_shape = (n_batch, n_add, 128, 128)
+
+        lhs = args.A.reshape(mm_shape)
+        rhs = args.B.reshape(mm_shape)
+        prod = args.C.reshape(mm_shape)  # presumably a view of args.C, so writes reach args — confirm
+        x_in = args.X.reshape(add_shape)
+        y_in = args.Y.reshape(add_shape)
+        total = args.Z.reshape(add_shape)  # presumably a view of args.Z, so writes reach args — confirm
+
+        for b in range(n_batch):
+            for m in range(n_mm):
+                prod[b, m] = torch.matmul(lhs[b, m], rhs[b, m])
+            for n in range(n_add):
+                total[b, n] = x_in[b, n] + y_in[b, n]
+
+
+if __name__ == "__main__":  # true only when this file is executed directly
+    SceneTestCase.run_module(__name__)  # hand this module's name to the SceneTestCase runner
diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py
new file mode 100644
index 000000000..0fa823d8e
--- /dev/null
+++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/test_paged_attention_ringbuffer.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention with small ring buffer sizes — stress test for ring rotation/reclamation.
+
+Tests RUNTIME_ENV (PTO2_RING_TASK_WINDOW, PTO2_RING_HEAP, PTO2_RING_DEP_POOL),
+INOUT tensors, bfloat16, and AIC+AIV mixed execution.
+"""
+
+import torch
+from paged_attention_golden import compute_golden as _pa_compute_golden  # noqa: PLC0415
+from paged_attention_golden import generate_inputs as _pa_generate_inputs  # noqa: PLC0415
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+PA_KERNELS = "../../../../tests/st/a2a3/tensormap_and_ringbuffer/paged_attention/kernels"  # prefix of CALLABLE paths
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestPagedAttentionRingbuffer(SceneTestCase):
+    """Paged-attention scene test run with the small PTO2_RING_* sizes set in RUNTIME_ENV."""
+
+    RTOL = 1e-3
+    ATOL = 1e-3
+    RUNTIME_ENV = {
+        "PTO2_RING_TASK_WINDOW": "128",
+        "PTO2_RING_HEAP": "262144",
+        "PTO2_RING_DEP_POOL": "256",
+    }
+
+    CALLABLE = {
+        "orchestration": {
+            "source": f"{PA_KERNELS}/orchestration/paged_attention_orch.cpp",
+            "function_name": "build_paged_attention_graph",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": f"{PA_KERNELS}/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": f"{PA_KERNELS}/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": f"{PA_KERNELS}/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": f"{PA_KERNELS}/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "ringbuffer_stress",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 32,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 4096,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        """Wrap the (name, value) pairs returned by ``_pa_generate_inputs`` into a TaskArgsBuilder."""
+        # torch.Tensor values become Tensor specs; every other value becomes a Scalar spec.
+        wrapped = [
+            Tensor(label, payload) if isinstance(payload, torch.Tensor) else Scalar(label, payload)
+            for label, payload in _pa_generate_inputs(params)
+        ]
+        # Spec order follows the order of the generated pairs.
+        return TaskArgsBuilder(*wrapped)
+
+    def compute_golden(self, args, params):
+        """Run ``_pa_compute_golden`` on the Tensor spec values and copy its entries back into ``args`` in place."""
+        expected = {spec.name: spec.value for spec in args.specs if isinstance(spec, Tensor)}
+        _pa_compute_golden(expected, params)
+        for dest in (spec.name for spec in args.specs if isinstance(spec, Tensor) and spec.name in expected):
+            getattr(args, dest)[:] = expected[dest]
+
+
+if __name__ == "__main__":  # true only when this file is executed directly
+    SceneTestCase.run_module(__name__)  # hand this module's name to the SceneTestCase runner
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_add.cpp
similarity index 100%
rename from tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_add.cpp
rename to examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_add.cpp
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_noop.cpp b/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_noop.cpp
similarity index 100%
rename from tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_noop.cpp
rename to examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/aiv/kernel_noop.cpp
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp
similarity index 100%
rename from tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp
rename to examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/orchestration/scalar_data_orch.cpp
diff --git a/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/test_scalar_data.py b/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/test_scalar_data.py
new file mode 100644
index 000000000..590bda5cb
--- /dev/null
+++ b/examples/a2a3/tensormap_and_ringbuffer/scalar_data_test/test_scalar_data.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Scalar data dependency test: GetTensorData, SetTensorData, add_inout.
+
+Tests orchestration-level data manipulation: scalar initialization,
+Get/Set round-trips, WAW+WAR dependency auto-wait, and external tensor WAR.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestScalarData(SceneTestCase):
+    """Scene test for scalar data dependencies: GetTensorData/SetTensorData and add_inout with an initial value."""
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/scalar_data_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/kernel_noop.cpp",
+                "core_type": "aiv",
+                "signature": [],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        """Build the fixed inputs ``a``/``b`` and the zeroed ``result``/``check`` tensors (``params`` is unused)."""
+        elems = 128 * 128
+        a = torch.full((elems,), 2.0, dtype=torch.float32)
+        b = torch.arange(elems, dtype=torch.float32)
+        result = torch.zeros(elems, dtype=torch.float32)
+        check = torch.zeros(10, dtype=torch.float32)
+        return TaskArgsBuilder(Tensor("a", a), Tensor("b", b), Tensor("result", result), Tensor("check", check))
+
+    def compute_golden(self, args, params):
+        """Write the expected ``result`` tensor and ``check`` slots 0-8 into ``args`` in place."""
+        args.result[:] = args.a + args.b  # kernel_add output: result = a + b
+        check = args.check
+        # Slots the orchestration writes via SetTensorData; slot 9 is not assigned here and keeps its 0.
+        check[0] = 2.0  # GetTensorData(c, {0}): c = a + b, c[0] = 2.0+0.0
+        check[1] = 102.0  # GetTensorData(c, {100}): c[100] = 2.0+100.0
+        check[2] = 77.0  # runtime-created scalar output initialized to 77.0
+        check[3] = 77.0  # second noop via add_inout preserves the value
+        check[4] = 79.0  # orchestration arithmetic: 2.0 + 77.0
+        check[5] = 42.0  # Orch set->get round-trip: SetTensorData then GetTensorData
+        check[6] = 12.0  # Orch->AICore RAW: SetTensorData(d,10.0) + kernel_add(d,a) -> 10.0+2.0
+        check[7] = 88.0  # WAW+WAR: kernel reads c, SetTensorData(c,88.0) auto-waits
+        check[8] = 55.0  # External WAR: noop(ext_b INOUT) -> SetTensorData(ext_b,55.0) auto-waits
+
+
+if __name__ == "__main__":  # true only when this file is executed directly
+    SceneTestCase.run_module(__name__)  # hand this module's name to the SceneTestCase runner
diff --git a/pyproject.toml b/pyproject.toml
index d25a55c9d..ad5d343ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,6 +16,9 @@ name = "simpler"
 version = "0.1.0"
 requires-python = ">=3.9"
 
+[project.optional-dependencies]
+test = ["pytest>=7.0"]  # 7.0 is the first release with the `pythonpath` ini option set in [tool.pytest.ini_options]
+
 [tool.ruff]
 line-length = 120
 target-version = "py39"
@@ -65,7 +68,8 @@ reportRedeclaration = false
 
 [tool.pytest.ini_options]
 testpaths = ["tests", "examples"]
-pythonpath = ["python"]
+pythonpath = ["python", "golden"]
+addopts = "--import-mode=importlib"
 
 [tool.scikit-build]
 wheel.packages = ["simpler_setup"]
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/README.md b/tests/st/a2a3/aicpu_build_graph/paged_attention/README.md
deleted file mode 100644
index b56d9774d..000000000
--- a/tests/st/a2a3/aicpu_build_graph/paged_attention/README.md
+++ /dev/null
@@ -1,113 +0,0 @@
-# Paged Attention (Device Test - aicpu_build_graph)
-
-This test demonstrates Paged Attention using the **aicpu_build_graph** runtime, where the AICPU device builds the task graph at runtime via a dlopen'd orchestration plugin while scheduler threads execute tasks concurrently.
-
-The kernel implementations are identical to the `host_build_graph` version. Only the orchestration and runtime configuration differ.
-
-## Overview
-
-Paged Attention is an efficient attention mechanism that processes KV cache in fixed-size blocks, enabling memory-efficient inference for long sequences. This implementation uses:
-
-- **CCE-style codegen** for AIC kernels (Cube unit matmul)
-- **PTO Tile API** for AIV kernels (Vector unit operations)
-- **Online Softmax** algorithm for numerically stable incremental computation
-
-### Runtime Architecture
-
-In `aicpu_build_graph` mode:
-- **1 AICPU builder thread** runs the orchestration plugin (builds the task graph)
-- **3 AICPU scheduler threads** execute tasks concurrently with graph construction
-- The orchestration plugin is compiled as a `.so`, embedded in Runtime, and dlopen'd on AICPU
-- The framework pre-allocates device memory for I/O tensors and populates `orch_args[]`
-
-### Supported Platforms
-
-| Platform | Description |
-|----------|-------------|
-| a2a3 | Ascend hardware (requires device ID) |
-
-> This test uses bfloat16 data types and production-scale shapes that are not supported by the a2a3sim simulator. It only runs on real hardware.
-
-### Algorithm
-
-For each query token, the attention is computed incrementally across KV cache blocks:
-
-```
-For each block j:
-    sij = Qi @ Kj^T                    # QK MatMul (AIC)
-    mij, lij, pij = softmax_prepare(sij)  # Softmax (AIV)
-    oi_new = pij @ Vj                  # PV MatMul (AIC)
-    oi = online_update(oi, oi_new, mij, lij)  # Accumulate (AIV)
-```
-
-### Task Graph Structure
-
-For each batch, the task dependency pattern is:
-
-```
-Block 0: QK -> SF -> PV --+
-Block 1: QK -> SF -> PV --+--> UP[0] -> UP[1] -> ... -> UP[n]
-Block n: QK -> SF -> PV --+
-```
-
-- **QK/SF/PV chains**: Run in parallel across blocks
-- **UP (Online Update)**: Serialized within batch due to accumulator dependency
-
-## Quick Start
-
-```bash
-# Run on hardware (specify device ID)
-python examples/scripts/run_example.py \
-  -k tests/st/aicpu_build_graph/paged_attention/kernels \
-  -g tests/st/aicpu_build_graph/paged_attention/golden.py \
-  -p a2a3 -d 0
-
-# Run multi-block test case
-PA_CASE=Case2 python examples/scripts/run_example.py \
-  -k tests/st/aicpu_build_graph/paged_attention/kernels \
-  -g tests/st/aicpu_build_graph/paged_attention/golden.py \
-  -p a2a3 -d 0
-```
-
-## Directory Structure
-
-```
-paged_attention/
-├── README.md                    # This file
-├── golden.py                    # Input generation and expected output
-└── kernels/
-    ├── kernel_config.py         # Kernel registration config (aicpu_build_graph)
-    ├── aic/                      # AIC kernels (CCE codegen style)
-    │   ├── aic_qk_matmul.cpp     # Q @ K^T matmul
-    │   └── aic_pv_matmul.cpp     # P @ V matmul
-    ├── aiv/                      # AIV kernels (PTO Tile API)
-    │   ├── aiv_softmax_prepare.cpp  # Softmax preparation
-    │   └── aiv_online_update.cpp    # Online Softmax update + normalize
-    └── orchestration/
-        └── paged_attention_orch.cpp # AICPU task graph builder (dlopen'd plugin)
-```
-
-## Test Cases
-
-| Case | batch | num_heads | kv_head_num | head_dim | block_size | context_len | Description |
-|------|-------|-----------|-------------|----------|------------|-------------|-------------|
-| Case1 | 256 | 16 | 1 | 128 | 128 | 8192 | Default |
-| Case2 | 64 | 64 | 1 | 128 | 64 | 8192 | Multi-block |
-
-All test cases use **bfloat16** Q/K/V inputs with GQA (kv_head_num=1).
-
-## Key Differences from host_build_graph Version
-
-| Aspect | host_build_graph | aicpu_build_graph |
-|--------|------------------|-------------------|
-| Graph building | Host CPU | AICPU device (dlopen'd plugin) |
-| I/O memory | Orchestration allocates + copies | Framework pre-manages |
-| Task API | `runtime->add_task()` | `api.add_task()` |
-| Dependency API | `runtime->add_successor()` | `api.add_successor_conditional()` |
-| Task visibility | Implicit | Explicit `api.publish_task()` |
-| Thread model | 3 scheduler threads | 1 builder + 3 scheduler threads |
-
-## See Also
-
-- [host_build_graph version](../../host_build_graph/paged_attention/README.md)
-- [Test Framework Documentation](../../../../examples/scripts/README.md)
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/golden.py b/tests/st/a2a3/aicpu_build_graph/paged_attention/golden.py
deleted file mode 100644
index cb27d10f6..000000000
--- a/tests/st/a2a3/aicpu_build_graph/paged_attention/golden.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Paged Attention Golden - aicpu_build_graph test (production scale, bfloat16)."""
-
-from paged_attention_golden import (
-    compute_golden,  # noqa: F401
-    run_golden_test,
-)
-from paged_attention_golden import generate_inputs as _generate_inputs
-
-__outputs__ = ["out"]
-
-RTOL = 1e-3
-ATOL = 1e-3
-
-ALL_CASES = {
-    "Case1": {
-        "batch": 256,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 128,
-        "block_size": 128,
-        "context_len": 8192,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-    "Case2": {
-        "batch": 64,
-        "num_heads": 64,
-        "kv_head_num": 1,
-        "head_dim": 128,
-        "block_size": 64,
-        "context_len": 8192,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-}
-
-DEFAULT_CASE = "Case1"
-
-
-def generate_inputs(params: dict) -> list:
-    return _generate_inputs(params)
-
-
-if __name__ == "__main__":
-    run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs)
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/kernel_config.py b/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/kernel_config.py
deleted file mode 100644
index 58021f61c..000000000
--- a/tests/st/a2a3/aicpu_build_graph/paged_attention/kernels/kernel_config.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Paged Attention — aicpu_build_graph Runtime
-
-Kernels and orchestration config for paged attention (per-block version).
-Uses explicit add_dependency for task ordering, scope-end batch publish.
-
-AIC Kernels (Cube):
-  - aic_qk_matmul: Q @ K^T computation
-  - aic_pv_matmul: P @ V computation
-  - aic_hub: placeholder hub task
-
-AIV Kernels (Vector):
-  - aiv_softmax_prepare: scale, rowmax, exp, rowsum
-  - aiv_online_update: online softmax accumulation + fused normalization
-  - aiv_hub: zero-initialize accumulators
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-_KERNELS_ROOT = Path(__file__).parent
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-    "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "QK",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 2,
-        "name": "PV",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 4,
-        "name": "AIC_HUB",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_hub.cpp"),
-        "core_type": "aic",
-        "signature": [],
-    },
-    {
-        "func_id": 1,
-        "name": "SF",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.OUT, D.OUT, D.OUT],
-    },
-    {
-        "func_id": 3,
-        "name": "UP",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
-    },
-    {
-        "func_id": 5,
-        "name": "AIV_HUB",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_hub.cpp"),
-        "core_type": "aiv",
-        "signature": [],
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "aicpu_build_graph",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/tests/st/a2a3/host_build_graph/paged_attention/README.md b/tests/st/a2a3/host_build_graph/paged_attention/README.md
deleted file mode 100644
index bb280c331..000000000
--- a/tests/st/a2a3/host_build_graph/paged_attention/README.md
+++ /dev/null
@@ -1,192 +0,0 @@
-# Paged Attention (Device Test)
-
-This example demonstrates Paged Attention implementation using CCE (Cube Core Engine) code generation, with AIC matmul kernels and AIV vector kernels using PTO Tile API.
-
-## Overview
-
-Paged Attention is an efficient attention mechanism that processes KV cache in fixed-size blocks, enabling memory-efficient inference for long sequences. This implementation uses:
-
-- **CCE-style codegen** for AIC kernels (Cube unit matmul)
-- **PTO Tile API** for AIV kernels (Vector unit operations)
-- **Online Softmax** algorithm for numerically stable incremental computation
-
-### Supported Platforms
-
-| Platform | Description |
-|----------|-------------|
-| a2a3 | Ascend hardware (requires device ID) |
-
-> This test uses bfloat16 data types and production-scale shapes that are not supported by the a2a3sim simulator. It only runs on real hardware.
-
-### Algorithm
-
-For each query token, the attention is computed incrementally across KV cache blocks:
-
-```
-For each block j:
-    sij = Qi @ Kj^T                    # QK MatMul (AIC)
-    mij, lij, pij = softmax_prepare(sij)  # Softmax (AIV)
-    oi_new = pij @ Vj                  # PV MatMul (AIC)
-    oi = online_update(oi, oi_new, mij, lij)  # Accumulate (AIV)
-```
-
-### Kernel Design (AIC/AIV Split)
-
-| Kernel | Core Type | Operation | Key Instructions |
-|--------|-----------|-----------|------------------|
-| aic_qk_matmul | AIC (Cube) | Q @ K^T | TLOAD/TMOV/TMATMUL/TSTORE |
-| aiv_softmax_prepare | AIV (Vector) | scale, rowmax, exp, rowsum | TMULS/TROWMAX/TROWEXPANDSUB/TEXP/TROWSUM |
-| aic_pv_matmul | AIC (Cube) | P @ V | TLOAD/TMOV/TMATMUL/TSTORE |
-| aiv_online_update | AIV (Vector) | Online Softmax + normalize | TMAX/TSUB/TEXP/TROWEXPANDMUL/TROWEXPANDDIV |
-
-### Memory Hierarchy (AIC Matmul)
-
-```
-GM -> L1 (Mat tiles) -> L0A/L0B -> L0C (Accumulator) -> GM
-```
-
-### Task Graph Structure
-
-For each batch, the task dependency pattern is:
-
-```
-Block 0: QK -> SF -> PV --+
-Block 1: QK -> SF -> PV --+--> UP[0] -> UP[1] -> ... -> UP[n]
-Block n: QK -> SF -> PV --+
-```
-
-- **QK/SF/PV chains**: Run in parallel across blocks
-- **UP (Online Update)**: Serialized within batch due to accumulator dependency
-
-## Quick Start
-
-```bash
-# Run on hardware (specify device ID)
-python examples/scripts/run_example.py \
-  -k tests/st/host_build_graph/paged_attention/kernels \
-  -g tests/st/host_build_graph/paged_attention/golden.py \
-  -p a2a3 -d 0
-
-# Run multi-block test case
-PA_CASE=Case2 python examples/scripts/run_example.py \
-  -k tests/st/host_build_graph/paged_attention/kernels \
-  -g tests/st/host_build_graph/paged_attention/golden.py \
-  -p a2a3 -d 0
-```
-
-## Directory Structure
-
-```
-paged_attention/
-├── README.md                    # This file
-├── golden.py                    # Input generation and expected output
-└── kernels/
-    ├── kernel_config.py         # Kernel registration config
-    ├── aic/                      # AIC kernels (CCE codegen style)
-    │   ├── aic_qk_matmul.cpp     # Q @ K^T matmul
-    │   └── aic_pv_matmul.cpp     # P @ V matmul
-    ├── aiv/                      # AIV kernels (PTO Tile API)
-    │   ├── aiv_softmax_prepare.cpp  # Softmax preparation
-    │   └── aiv_online_update.cpp    # Online Softmax update + normalize
-    └── orchestration/
-        └── paged_attention_orch.cpp # Task graph builder
-```
-
-## Test Cases
-
-| Case | batch | num_heads | kv_head_num | head_dim | block_size | context_len | Description |
-|------|-------|-----------|-------------|----------|------------|-------------|-------------|
-| Case1 | 1 | 16 | 1 | 128 | 128 | 256 | Small scale (default) |
-| Case2 | 8 | 64 | 1 | 128 | 64 | 8192 | Production scale |
-
-All test cases use **bfloat16** Q/K/V inputs with GQA (kv_head_num=1).
-
-## Key Technical Details
-
-### AIC Kernels (CCE Codegen)
-
-```cpp
-// L1 tiles: ColMajor + SLayout::RowMajor (required for matmul)
-using TileMatA = Tile<TileType::Mat, float, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-using TileMatB = Tile<TileType::Mat, float, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
-
-// L0 tiles: Use standard TileLeft/TileRight/TileAcc aliases
-using LeftTile = TileLeft<float, M, K, M, K>;
-using RightTile = TileRight<float, K, N, K, N>;
-using AccTile = TileAcc<float, M, N, M, N>;
-
-// Pipeline: MTE2 -> MTE1 -> M -> FIX -> MTE3
-TLOAD(aMatTile, qiGlobal);           // GM -> L1
-TMOV(aTile, aMatTile);               // L1 -> L0A
-TMATMUL(cTile, aTile, bTile);        // L0A x L0B -> L0C
-TSTORE(sijGlobal, cTile);            // L0C -> GM
-```
-
-### AIV Kernels (PTO Tile API)
-
-**softmax_prepare**: Uses DN layout (ColMajor, 16x1) for row reduction results
-
-```cpp
-using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, kRows, 1>;
-
-TMULS(sijTile, sijTile, scale_value);      // Scale
-TROWMAX(maxTile, sijTile, tmpTile);        // Row max
-TROWEXPANDSUB(pijTile, sijTile, maxTile);  // Subtract max (broadcast)
-TEXP(pijTile, pijTile);                    // Exp
-TROWSUM(sumTile, pijTile, tmpTile);        // Row sum
-```
-
-**online_update**: Uses ND/DN layout conversion for hardware compatibility
-
-```cpp
-// ND (1x16, RowMajor) for scalar arithmetic - TSUB/TMUL/TADD require RowMajor
-using TileScalarND = Tile<TileType::Vec, float, 1, kNumHeads, BLayout::RowMajor, 1, kNumHeads>;
-// DN (16x1, ColMajor) for row broadcast - TROWEXPANDMUL/TROWEXPANDDIV require this
-using TileScalarDN = Tile<TileType::Vec, float, kNumHeads, 1, BLayout::ColMajor, kNumHeads, 1>;
-
-// Arithmetic in ND layout
-TMAX(miNewTileND, miTileND, mijTileND);
-TSUB(alphaTileND, miTileND, miNewTileND);
-TEXP(alphaTileND, alphaTileND);
-
-// Reshape ND -> DN for broadcast operations
-TRESHAPE(alphaTileDN, alphaTileND);
-TROWEXPANDMUL(oiTile, oiTile, alphaTileDN);
-```
-
-### Data Layout
-
-- **K stored as K^T**: (head_dim, block_size) for direct matmul compatibility
-- **V stored normally**: (block_size, head_dim)
-
-## Expected Output
-
-```
-=== Compiling and Registering Kernels ===
-Compiling kernel: .../aic_qk_matmul.cpp (func_id=0)
-Compiling kernel: .../aiv_softmax_prepare.cpp (func_id=1)
-Compiling kernel: .../aic_pv_matmul.cpp (func_id=2)
-Compiling kernel: .../aiv_online_update.cpp (func_id=3)
-...
-=== build_paged_attention_graph (16x16 framework version) ===
-batch=1, num_heads=16, kv_head_num=1, head_dim=16
-block_size=16, block_num=1
-...
-Created 4 tasks
-...
-=== Comparing Results ===
-Comparing out: shape=(256,), dtype=float32
-  out: PASS (256/256 elements matched)
-
-============================================================
-TEST PASSED
-============================================================
-```
-
-## Reference
-
-This implementation uses the Online Softmax algorithm for paged attention, with identical kernel structure to the PyPTO reference implementation.
-
-## See Also
-
-- [Test Framework Documentation](../../../../examples/scripts/README.md)
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/golden.py b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/golden.py
deleted file mode 100644
index aba775d82..000000000
--- a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/golden.py
+++ /dev/null
@@ -1,40 +0,0 @@
-"""
-Paged Attention Ring Buffer Stress Test Golden
-
-Tests paged attention with small ring buffer sizes (TW=1024, HP=1MB, DP=1024)
-to guard the ring buffer rotation/reclamation logic.
-"""
-
-from paged_attention_golden import (
-    generate_inputs as _generate_inputs,
-    compute_golden,
-    run_golden_test,
-)
-
-__outputs__ = ["out"]
-
-RTOL = 1e-3
-ATOL = 1e-3
-
-ALL_CASES = {
-    "Case1": {
-        "batch": 32,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 128,
-        "block_size": 128,
-        "context_len": 4096,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-}
-
-DEFAULT_CASE = "Case1"
-
-
-def generate_inputs(params: dict) -> list:
-    return _generate_inputs(params)
-
-
-if __name__ == "__main__":
-    run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs)
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/kernels/kernel_config.py b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/kernels/kernel_config.py
deleted file mode 100644
index 100a75305..000000000
--- a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_ringbuffer/kernels/kernel_config.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Paged Attention Ring Buffer Stress Test
-
-Reuses paged_attention kernels and orchestration with deliberately small
-ring buffer sizes to exercise and guard the ring buffer rotation logic.
-
-The orchestration uses an inner PTO2_SCOPE per block, allowing per-block
-ring resources to be reclaimed. Combined with small ring sizes, this
-stresses the back-pressure and reclamation paths.
-
-Environment overrides:
-  PTO2_RING_TASK_WINDOW = 128   (vs default 65536, 8x smaller than prev 1024)
-  PTO2_RING_HEAP        = 256KB (vs default 1GB,   4x smaller than prev 1MB)
-  PTO2_RING_DEP_POOL    = 256   (vs default 65536, 4x smaller than prev 1024)
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-# Point to paged_attention's kernel sources (no duplication)
-_PA_KERNELS = Path(__file__).parent / ".." / ".." / "paged_attention" / "kernels"
-
-ORCHESTRATION = {
-    "source": str(_PA_KERNELS / "orchestration" / "paged_attention_orch.cpp"),
-    "function_name": "build_paged_attention_graph",
-    "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "QK",
-        "source": str(_PA_KERNELS / "aic" / "aic_qk_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 2,
-        "name": "PV",
-        "source": str(_PA_KERNELS / "aic" / "aic_pv_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 1,
-        "name": "SF",
-        "source": str(_PA_KERNELS / "aiv" / "aiv_softmax_prepare.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.OUT, D.OUT, D.OUT],
-    },
-    {
-        "func_id": 3,
-        "name": "UP",
-        "source": str(_PA_KERNELS / "aiv" / "aiv_online_update.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
-
-# Small ring buffer sizes — see module docstring for rationale.
-RUNTIME_ENV = {
-    "PTO2_RING_TASK_WINDOW": "128",
-    "PTO2_RING_HEAP": "262144",
-    "PTO2_RING_DEP_POOL": "256",
-}
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/golden.py b/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/golden.py
deleted file mode 100644
index 2b010b018..000000000
--- a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/golden.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-
-"""
-Golden script for scalar data dependency test.
-
-Tests GetTensorData, SetTensorData, and add_inout with initial value.
-
-Computation:
-  c = a + b (kernel, internal tensor)
-  check[0] = GetTensorData(c, {0})     = a[0]+b[0] = 2.0+0.0 = 2.0
-  check[1] = GetTensorData(c, {100})   = a[100]+b[100] = 2.0+100.0 = 102.0
-  scalar initialized to 77.0 via add_output(TensorCreateInfo, to_u64(77.0f))
-  check[2] = GetTensorData(scalar, {0}) = 77.0
-  second noop with add_inout(scalar), value preserved
-  check[3] = GetTensorData(scalar, {0}) = 77.0
-  check[4] = orchestration arithmetic: 2.0 + 77.0 = 79.0
-  SetTensorData(scalar, {0}, 42.0), then GetTensorData round-trip
-  check[5] = GetTensorData(scalar, {0}) = 42.0
-  Orch SetTensorData(d, {0}, 10.0) → kernel_add(d, a) → e[0] = 12.0
-  check[6] = GetTensorData(e, {0}) = 12.0
-  WAW+WAR: kernel reads c as INPUT, then SetTensorData(c, 88.0) auto-waits
-  check[7] = GetTensorData(c, {0}) = 88.0
-  External WAR: noop(ext_b as INOUT) → SetTensorData(ext_b, 55.0) auto-waits
-  check[8] = GetTensorData(ext_b, {0}) = 55.0 (ext_b[0] restored to 0.0 after)
-  result = a + b (kernel, external output)
-
-Args layout: [a, b, result, check]
-"""
-
-import torch
-
-__outputs__ = ["result", "check"]
-
-RTOL = 1e-5
-ATOL = 1e-5
-
-
-def generate_inputs(params: dict) -> list:
-    SIZE = 128 * 128  # 16384 -- matches kernel_add 128x128 tile
-
-    a = torch.full((SIZE,), 2.0, dtype=torch.float32)
-    b = torch.arange(SIZE, dtype=torch.float32)
-    result = torch.zeros(SIZE, dtype=torch.float32)
-    check = torch.zeros(10, dtype=torch.float32)
-
-    return [
-        ("a", a),
-        ("b", b),
-        ("result", result),
-        ("check", check),
-    ]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    a = torch.as_tensor(tensors["a"])
-    b = torch.as_tensor(tensors["b"])
-
-    # result = a + b (computed by kernel_add)
-    tensors["result"][:] = a + b
-
-    # check values written by orchestration via SetTensorData
-    check = torch.as_tensor(tensors["check"])
-    check[0] = 2.0  # GetTensorData(c, {0}): c = a + b, c[0] = 2.0+0.0
-    check[1] = 102.0  # GetTensorData(c, {100}): c[100] = 2.0+100.0
-    check[2] = 77.0  # runtime-created scalar output initialized to 77.0
-    check[3] = 77.0  # second noop via add_inout preserves the value
-    check[4] = 79.0  # orchestration arithmetic: 2.0 + 77.0
-    check[5] = 42.0  # Orch set→get round-trip: SetTensorData then GetTensorData
-    check[6] = 12.0  # Orch→AICore RAW: SetTensorData(d,10.0) + kernel_add(d,a) → 10.0+2.0
-    check[7] = 88.0  # WAW+WAR: kernel reads c, SetTensorData(c,88.0) auto-waits for consumer
-    check[8] = 55.0  # External WAR: noop(ext_b INOUT) → SetTensorData(ext_b,55.0) auto-waits
-    # check[9] remains 0.0 (sentinel)
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/kernel_config.py b/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/kernel_config.py
deleted file mode 100644
index f7c67b088..000000000
--- a/tests/st/a2a3/tensormap_and_ringbuffer/scalar_data_test/kernels/kernel_config.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for scalar data dependency test (tensormap_and_ringbuffer).
-
-Tests GetTensorData, SetTensorData, and add_inout with initial value.
-
-Kernels:
-  func_id=0: kernel_add (AIV) - element-wise tensor addition (128x128)
-  func_id=1: kernel_noop (AIV) - empty kernel for allocation trigger
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-_KERNELS_ROOT = Path(__file__).parent
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "scalar_data_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-    "signature": [D.IN, D.IN, D.OUT, D.OUT],
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_add.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 1,
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_noop.cpp"),
-        "core_type": "aiv",
-        "signature": [],
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 3,
-}
diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh
index 64b283e81..bd674b866 100755
--- a/tools/benchmark_rounds.sh
+++ b/tools/benchmark_rounds.sh
@@ -1,4 +1,12 @@
 #!/usr/bin/env bash
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
 # Benchmark wrapper: run examples on hardware,
 # then parse device-log timing lines to report per-round latency.
 #
@@ -122,9 +130,14 @@ vlog() {
 }
 
 # ---------------------------------------------------------------------------
-# Derive arch from platform and set examples directory
+# Derive arch from platform and set examples directories
 # ---------------------------------------------------------------------------
-EXAMPLES_DIR="$PROJECT_ROOT/tests/st/${PLATFORM}/${RUNTIME}"
+# Search examples/ (migrated tests) first, then tests/st/ (legacy tests); the first match wins
+ARCH="${PLATFORM%%sim}"  # strip a trailing "sim" so e.g. a2a3sim uses the a2a3 directories
+EXAMPLES_DIRS=(
+    "$PROJECT_ROOT/examples/${ARCH}/${RUNTIME}"
+    "$PROJECT_ROOT/tests/st/${ARCH}/${RUNTIME}"
+)
 
 # Clock frequency (MHz) for converting cycle counts to microseconds
 case "$PLATFORM" in
@@ -458,12 +471,19 @@ SUMMARY_ORCH=()
 
 echo ""
 echo "Runtime: $RUNTIME"
-echo "Tests dir: $EXAMPLES_DIR"
 
 for example in "${EXAMPLE_ORDER[@]}"; do
     case_list="${EXAMPLE_CASES[$example]:-}"
 
-    EXAMPLE_DIR="$EXAMPLES_DIR/$example"
+    # Pick the first search directory that has both golden.py and kernels/ for this example
+    EXAMPLE_DIR=""
+    for dir in "${EXAMPLES_DIRS[@]}"; do
+        if [[ -f "$dir/$example/golden.py" && -d "$dir/$example/kernels" ]]; then
+            EXAMPLE_DIR="$dir/$example"
+            break
+        fi
+    done
+
     KERNELS_DIR="$EXAMPLE_DIR/kernels"
     GOLDEN="$EXAMPLE_DIR/golden.py"
 
@@ -472,8 +492,8 @@ for example in "${EXAMPLE_ORDER[@]}"; do
     echo "  $example"
     echo "================================================================"
 
-    if [[ ! -f "$GOLDEN" || ! -d "$KERNELS_DIR" ]]; then
-        echo "  SKIP: missing kernels/ or golden.py"
+    if [[ -z "$EXAMPLE_DIR" ]]; then
+        echo "  SKIP: not found in any search directory"
         ((FAIL++)) || true
         continue
     fi