hw-native-sys · ChaoWao · Apr 15, 2026 · Apr 14, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -401,4 +401,7 @@ jobs:
 
       - name: Run pytest scene tests (a5)
         run: |
-          source ${ASCEND_HOME_PATH}/bin/setenv.bash && python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v
+          source ${ASCEND_HOME_PATH}/bin/setenv.bash
+          # Expand DEVICE_RANGE ("S-E", or a single id "N") into the comma-separated id list passed to task-submit --device.
+          DEVICE_LIST=$(python -c "r='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(r[0]),int(r[-1])+1)))")
+          task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v"
diff --git a/conftest.py b/conftest.py
@@ -69,6 +69,7 @@ def pytest_addoption(parser):
     parser.addoption(
         "--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)"
     )
+    parser.addoption("--dump-tensor", action="store_true", default=False, help="Dump per-task tensor I/O at runtime")
     parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source")
 
 

diff --git a/docs/tensor-dump.md b/docs/tensor-dump.md
diff --git a/docs/testing.md b/docs/testing.md
@@ -51,6 +51,10 @@ python examples/a2a3/tensormap_and_ringbuffer/vector_example/test_vector_example
 python examples/a2a3/tensormap_and_ringbuffer/vector_example/test_vector_example.py \
     -p a2a3 --enable-profiling
 
+# Tensor dump
+python tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/test_alternating_matmul_add.py \
+    -p a2a3 -d 0 --dump-tensor
+
 # Single example via run_example.py (deprecated — prefer test_*.py standalone)
 python examples/scripts/run_example.py \
     -k examples/a2a3/host_build_graph/vector_example/kernels \
@@ -95,6 +99,7 @@ pytest --platform a2a3sim --log-level debug                        # verbose C++
 python test_xxx.py -p a2a3sim                                    # default: 1 round + golden
 python test_xxx.py -p a2a3 -d 0 -n 100 --skip-golden            # benchmark mode
 python test_xxx.py -p a2a3 --enable-profiling                    # profiling (first round)
+python test_xxx.py -p a2a3 --dump-tensor                         # dump per-task tensor I/O
 python test_xxx.py -p a2a3sim --build                            # compile runtime from source
 python test_xxx.py -p a2a3sim --log-level debug                  # verbose C++ logging
 ```
@@ -106,6 +111,7 @@ python test_xxx.py -p a2a3sim --log-level debug                  # verbose C++ l
 | `--rounds N` | `-n` | 1 | Run each case N times |
 | `--skip-golden` | | false | Skip golden comparison (for benchmarking) |
 | `--enable-profiling` | | false | Enable profiling on first round only |
+| `--dump-tensor` | | false | Dump per-task tensor I/O during runtime execution |
 | `--build` | | false | Compile runtime from source (not pre-built) |
 | `--log-level LEVEL` | | (none) | Set `PTO_LOG_LEVEL` env var (`error`/`warn`/`info`/`debug`) |
 

diff --git a/examples/scripts/run_example.py b/examples/scripts/run_example.py
@@ -143,6 +143,12 @@ def compute_golden(tensors: dict, params: dict) -> None:
         help="Enable profiling and generate swimlane.json",
     )
 
+    parser.add_argument(
+        "--dump-tensor",
+        action="store_true",
+        help="Dump per-task tensor I/O at runtime (controlled by enable_dump_tensor flag)",
+    )
+
     parser.add_argument(
         "--all",
         action="store_true",
@@ -223,6 +229,7 @@ def compute_golden(tensors: dict, params: dict) -> None:
             device_id=args.device,
             platform=args.platform,
             enable_profiling=args.enable_profiling,
+            enable_dump_tensor=args.dump_tensor,
             run_all_cases=args.all,
             case_name=args.case,
             pto_isa_commit=args.pto_isa_commit,

diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
@@ -541,10 +541,12 @@ NB_MODULE(_task_interface, m) {
         .def_rw("block_dim", &ChipCallConfig::block_dim)
         .def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num)
         .def_rw("enable_profiling", &ChipCallConfig::enable_profiling)
+        .def_rw("enable_dump_tensor", &ChipCallConfig::enable_dump_tensor)
         .def("__repr__", [](const ChipCallConfig &self) -> std::string {
             std::ostringstream os;
             os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
-               << ", enable_profiling=" << (self.enable_profiling ? "True" : "False") << ")";
+               << ", enable_profiling=" << (self.enable_profiling ? "True" : "False")
+               << ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False") << ")";
             return os.str();
         });
 

diff --git a/simpler_setup/code_runner.py b/simpler_setup/code_runner.py
@@ -193,6 +193,7 @@ def __init__(  # noqa: PLR0913
         device_id: Optional[int] = None,
         platform: str = "a2a3",
         enable_profiling: bool = False,
+        enable_dump_tensor: bool = False,
         run_all_cases: bool = False,
         case_name: Optional[str] = None,
         pto_isa_commit: Optional[str] = None,
@@ -212,6 +213,7 @@ def __init__(  # noqa: PLR0913
         self.golden_path = Path(golden_path).resolve()
         self.platform = platform
         self.enable_profiling = enable_profiling
+        self.enable_dump_tensor = enable_dump_tensor
         self.skip_golden = skip_golden
         self.project_root = PROJECT_ROOT
 
@@ -608,6 +610,9 @@ def _compile_one_kernel(kernel):
                 if self.enable_profiling and round_idx == 0:
                     config.enable_profiling = True
                     logger.info("Profiling enabled")
+                if self.enable_dump_tensor:
+                    config.enable_dump_tensor = True
+                    logger.info("Dump tensor enabled")
 
                 with _temporary_env(run_env):
                     worker.run(chip_callable, orch_args, config)
@@ -682,6 +687,7 @@ def create_code_runner(  # noqa: PLR0913
     device_id=None,
     platform="a2a3",
     enable_profiling=False,
+    enable_dump_tensor=False,
     run_all_cases=False,
     case_name=None,
     pto_isa_commit=None,
@@ -698,6 +704,7 @@ def create_code_runner(  # noqa: PLR0913
         device_id=device_id,
         platform=platform,
         enable_profiling=enable_profiling,
+        enable_dump_tensor=enable_dump_tensor,
         run_all_cases=run_all_cases,
         case_name=case_name,
         pto_isa_commit=pto_isa_commit,

diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py
@@ -499,13 +499,14 @@ def build_callable(self, platform):
             return self._compile_l3_callables(platform)
         raise ValueError(f"Unsupported level: {self._st_level}")
 
-    def _build_config(self, config_dict, enable_profiling=False):
+    def _build_config(self, config_dict, enable_profiling=False, enable_dump_tensor=False):
         from simpler.task_interface import ChipCallConfig  # noqa: PLC0415
 
         config = ChipCallConfig()
         config.block_dim = config_dict.get("block_dim", 1)
         config.aicpu_thread_num = config_dict.get("aicpu_thread_num", 3)
         config.enable_profiling = enable_profiling
+        config.enable_dump_tensor = enable_dump_tensor
         return config
 
     def _resolve_env(self):
@@ -526,7 +527,15 @@ def _resolve_env(self):
     # ------------------------------------------------------------------
 
     def _run_and_validate(
-        self, worker, callable_obj, case, sub_ids=None, rounds=1, skip_golden=False, enable_profiling=False
+        self,
+        worker,
+        callable_obj,
+        case,
+        sub_ids=None,
+        rounds=1,
+        skip_golden=False,
+        enable_profiling=False,
+        enable_dump_tensor=False,
     ):
         if self._st_level == 2:
             self._run_and_validate_l2(
@@ -536,6 +545,7 @@ def _run_and_validate(
                 rounds=rounds,
                 skip_golden=skip_golden,
                 enable_profiling=enable_profiling,
+                enable_dump_tensor=enable_dump_tensor,
             )
         elif self._st_level == 3:
             self._run_and_validate_l3(
@@ -546,9 +556,12 @@ def _run_and_validate(
                 rounds=rounds,
                 skip_golden=skip_golden,
                 enable_profiling=enable_profiling,
+                enable_dump_tensor=enable_dump_tensor,
             )
 
-    def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden=False, enable_profiling=False):
+    def _run_and_validate_l2(
+        self, worker, callable_obj, case, rounds=1, skip_golden=False, enable_profiling=False, enable_dump_tensor=False
+    ):
         params = case.get("params", {})
         config_dict = case.get("config", {})
         orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", [])
@@ -575,7 +588,11 @@ def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden
                 for name, initial in initial_outputs.items():
                     getattr(test_args, name).copy_(initial)
 
-            config = self._build_config(config_dict, enable_profiling=(enable_profiling and round_idx == 0))
+            config = self._build_config(
+                config_dict,
+                enable_profiling=(enable_profiling and round_idx == 0),
+                enable_dump_tensor=enable_dump_tensor,
+            )
 
             with _temporary_env(self._resolve_env()):
                 worker.run(callable_obj, chip_args, config=config)
@@ -584,7 +601,15 @@ def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden
                 _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL)
 
     def _run_and_validate_l3(
-        self, worker, compiled_callables, sub_ids, case, rounds=1, skip_golden=False, enable_profiling=False
+        self,
+        worker,
+        compiled_callables,
+        sub_ids,
+        case,
+        rounds=1,
+        skip_golden=False,
+        enable_profiling=False,
+        enable_dump_tensor=False,
     ):
         from simpler.worker import Task  # noqa: PLC0415
 
@@ -619,7 +644,11 @@ def _run_and_validate_l3(
                 for name, initial in initial_tensors.items():
                     getattr(test_args, name).copy_(initial)
 
-            config = self._build_config(config_dict, enable_profiling=(enable_profiling and round_idx == 0))
+            config = self._build_config(
+                config_dict,
+                enable_profiling=(enable_profiling and round_idx == 0),
+                enable_dump_tensor=enable_dump_tensor,
+            )
 
             # Wrap in Task — user orch signature: (orch, callables, task_args, config)
             def task_orch(orch, _unused, _ns=ns, _test_args=test_args, _config=config):
@@ -642,6 +671,7 @@ def test_run(self, st_platform, st_worker, request):
         rounds = request.config.getoption("--rounds", default=1)
         skip_golden = request.config.getoption("--skip-golden", default=False)
         enable_profiling = request.config.getoption("--enable-profiling", default=False)
+        enable_dump_tensor = request.config.getoption("--dump-tensor", default=False)
 
         callable_obj = self.build_callable(st_platform)
         sub_ids = getattr(type(self), "_st_sub_ids", {})
@@ -661,6 +691,7 @@ def test_run(self, st_platform, st_worker, request):
                 rounds=rounds,
                 skip_golden=skip_golden,
                 enable_profiling=enable_profiling,
+                enable_dump_tensor=enable_dump_tensor,
             )
             ran_any = True
 
@@ -686,6 +717,7 @@ def run_module(module_name):
         parser.add_argument("-n", "--rounds", type=int, default=1, help="Run each case N times (default: 1)")
         parser.add_argument("--skip-golden", action="store_true", help="Skip golden comparison (benchmark mode)")
         parser.add_argument("--enable-profiling", action="store_true", help="Enable profiling (first round only)")
+        parser.add_argument("--dump-tensor", action="store_true", help="Dump per-task tensor I/O at runtime")
         parser.add_argument("--build", action="store_true", help="Compile runtime from source")
         parser.add_argument(
             "--log-level",
@@ -734,6 +766,7 @@ def run_module(module_name):
                                 rounds=args.rounds,
                                 skip_golden=args.skip_golden,
                                 enable_profiling=args.enable_profiling,
+                                enable_dump_tensor=args.dump_tensor,
                             )
                             print("PASSED")
                         except Exception as e: