Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -401,4 +401,6 @@ jobs:

- name: Run pytest scene tests (a5)
run: |
source ${ASCEND_HOME_PATH}/bin/setenv.bash && python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v
source ${ASCEND_HOME_PATH}/bin/setenv.bash
DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))")
task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v"
1 change: 1 addition & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def pytest_addoption(parser):
parser.addoption(
"--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)"
)
parser.addoption("--dump-tensor", action="store_true", default=False, help="Dump per-task tensor I/O at runtime")
parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source")


Expand Down
603 changes: 603 additions & 0 deletions docs/tensor-dump.md

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions docs/testing.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ python examples/a2a3/tensormap_and_ringbuffer/vector_example/test_vector_example
python examples/a2a3/tensormap_and_ringbuffer/vector_example/test_vector_example.py \
-p a2a3 --enable-profiling

# Tensor dump
python tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/test_alternating_matmul_add.py \
-p a2a3 -d 11 --dump-tensor

# Single example via run_example.py (deprecated — prefer test_*.py standalone)
python examples/scripts/run_example.py \
-k examples/a2a3/host_build_graph/vector_example/kernels \
Expand Down Expand Up @@ -95,6 +99,7 @@ pytest --platform a2a3sim --log-level debug # verbose C++
python test_xxx.py -p a2a3sim # default: 1 round + golden
python test_xxx.py -p a2a3 -d 0 -n 100 --skip-golden # benchmark mode
python test_xxx.py -p a2a3 --enable-profiling # profiling (first round)
python test_xxx.py -p a2a3 --dump-tensor # dump per-task tensor I/O
python test_xxx.py -p a2a3sim --build # compile runtime from source
python test_xxx.py -p a2a3sim --log-level debug # verbose C++ logging
```
Expand All @@ -106,6 +111,7 @@ python test_xxx.py -p a2a3sim --log-level debug # verbose C++ l
| `--rounds N` | `-n` | 1 | Run each case N times |
| `--skip-golden` | | false | Skip golden comparison (for benchmarking) |
| `--enable-profiling` | | false | Enable profiling on first round only |
| `--dump-tensor` | | false | Dump per-task tensor I/O during runtime execution |
| `--build` | | false | Compile runtime from source (not pre-built) |
| `--log-level LEVEL` | | (none) | Set `PTO_LOG_LEVEL` env var (`error`/`warn`/`info`/`debug`) |

Expand Down
7 changes: 7 additions & 0 deletions examples/scripts/run_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,12 @@ def compute_golden(tensors: dict, params: dict) -> None:
help="Enable profiling and generate swimlane.json",
)

parser.add_argument(
"--dump-tensor",
action="store_true",
help="Dump per-task tensor I/O at runtime (controlled by enable_dump_tensor flag)",
)

parser.add_argument(
"--all",
action="store_true",
Expand Down Expand Up @@ -223,6 +229,7 @@ def compute_golden(tensors: dict, params: dict) -> None:
device_id=args.device,
platform=args.platform,
enable_profiling=args.enable_profiling,
enable_dump_tensor=args.dump_tensor,
run_all_cases=args.all,
case_name=args.case,
pto_isa_commit=args.pto_isa_commit,
Expand Down
4 changes: 3 additions & 1 deletion python/bindings/task_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -541,10 +541,12 @@ NB_MODULE(_task_interface, m) {
.def_rw("block_dim", &ChipCallConfig::block_dim)
.def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num)
.def_rw("enable_profiling", &ChipCallConfig::enable_profiling)
.def_rw("enable_dump_tensor", &ChipCallConfig::enable_dump_tensor)
.def("__repr__", [](const ChipCallConfig &self) -> std::string {
std::ostringstream os;
os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
<< ", enable_profiling=" << (self.enable_profiling ? "True" : "False") << ")";
<< ", enable_profiling=" << (self.enable_profiling ? "True" : "False")
<< ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False") << ")";
return os.str();
});

Expand Down
7 changes: 7 additions & 0 deletions simpler_setup/code_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ def __init__( # noqa: PLR0913
device_id: Optional[int] = None,
platform: str = "a2a3",
enable_profiling: bool = False,
enable_dump_tensor: bool = False,
run_all_cases: bool = False,
case_name: Optional[str] = None,
pto_isa_commit: Optional[str] = None,
Expand All @@ -212,6 +213,7 @@ def __init__( # noqa: PLR0913
self.golden_path = Path(golden_path).resolve()
self.platform = platform
self.enable_profiling = enable_profiling
self.enable_dump_tensor = enable_dump_tensor
self.skip_golden = skip_golden
self.project_root = PROJECT_ROOT

Expand Down Expand Up @@ -608,6 +610,9 @@ def _compile_one_kernel(kernel):
if self.enable_profiling and round_idx == 0:
config.enable_profiling = True
logger.info("Profiling enabled")
if self.enable_dump_tensor:
config.enable_dump_tensor = True
logger.info("Dump tensor enabled")

with _temporary_env(run_env):
worker.run(chip_callable, orch_args, config)
Expand Down Expand Up @@ -682,6 +687,7 @@ def create_code_runner( # noqa: PLR0913
device_id=None,
platform="a2a3",
enable_profiling=False,
enable_dump_tensor=False,
run_all_cases=False,
case_name=None,
pto_isa_commit=None,
Expand All @@ -698,6 +704,7 @@ def create_code_runner( # noqa: PLR0913
device_id=device_id,
platform=platform,
enable_profiling=enable_profiling,
enable_dump_tensor=enable_dump_tensor,
run_all_cases=run_all_cases,
case_name=case_name,
pto_isa_commit=pto_isa_commit,
Expand Down
45 changes: 39 additions & 6 deletions simpler_setup/scene_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,13 +499,14 @@ def build_callable(self, platform):
return self._compile_l3_callables(platform)
raise ValueError(f"Unsupported level: {self._st_level}")

def _build_config(self, config_dict, enable_profiling=False):
def _build_config(self, config_dict, enable_profiling=False, enable_dump_tensor=False):
from simpler.task_interface import ChipCallConfig # noqa: PLC0415

config = ChipCallConfig()
config.block_dim = config_dict.get("block_dim", 1)
config.aicpu_thread_num = config_dict.get("aicpu_thread_num", 3)
config.enable_profiling = enable_profiling
config.enable_dump_tensor = enable_dump_tensor
return config

def _resolve_env(self):
Expand All @@ -526,7 +527,15 @@ def _resolve_env(self):
# ------------------------------------------------------------------

def _run_and_validate(
self, worker, callable_obj, case, sub_ids=None, rounds=1, skip_golden=False, enable_profiling=False
self,
worker,
callable_obj,
case,
sub_ids=None,
rounds=1,
skip_golden=False,
enable_profiling=False,
enable_dump_tensor=False,
):
if self._st_level == 2:
self._run_and_validate_l2(
Expand All @@ -536,6 +545,7 @@ def _run_and_validate(
rounds=rounds,
skip_golden=skip_golden,
enable_profiling=enable_profiling,
enable_dump_tensor=enable_dump_tensor,
)
elif self._st_level == 3:
self._run_and_validate_l3(
Expand All @@ -546,9 +556,12 @@ def _run_and_validate(
rounds=rounds,
skip_golden=skip_golden,
enable_profiling=enable_profiling,
enable_dump_tensor=enable_dump_tensor,
)

def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden=False, enable_profiling=False):
def _run_and_validate_l2(
self, worker, callable_obj, case, rounds=1, skip_golden=False, enable_profiling=False, enable_dump_tensor=False
):
params = case.get("params", {})
config_dict = case.get("config", {})
orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", [])
Expand All @@ -575,7 +588,11 @@ def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden
for name, initial in initial_outputs.items():
getattr(test_args, name).copy_(initial)

config = self._build_config(config_dict, enable_profiling=(enable_profiling and round_idx == 0))
config = self._build_config(
config_dict,
enable_profiling=(enable_profiling and round_idx == 0),
enable_dump_tensor=enable_dump_tensor,
)

with _temporary_env(self._resolve_env()):
worker.run(callable_obj, chip_args, config=config)
Expand All @@ -584,7 +601,15 @@ def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden
_compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL)

def _run_and_validate_l3(
self, worker, compiled_callables, sub_ids, case, rounds=1, skip_golden=False, enable_profiling=False
self,
worker,
compiled_callables,
sub_ids,
case,
rounds=1,
skip_golden=False,
enable_profiling=False,
enable_dump_tensor=False,
):
from simpler.worker import Task # noqa: PLC0415

Expand Down Expand Up @@ -619,7 +644,11 @@ def _run_and_validate_l3(
for name, initial in initial_tensors.items():
getattr(test_args, name).copy_(initial)

config = self._build_config(config_dict, enable_profiling=(enable_profiling and round_idx == 0))
config = self._build_config(
config_dict,
enable_profiling=(enable_profiling and round_idx == 0),
enable_dump_tensor=enable_dump_tensor,
)

# Wrap in Task — user orch signature: (orch, callables, task_args, config)
def task_orch(orch, _unused, _ns=ns, _test_args=test_args, _config=config):
Expand All @@ -642,6 +671,7 @@ def test_run(self, st_platform, st_worker, request):
rounds = request.config.getoption("--rounds", default=1)
skip_golden = request.config.getoption("--skip-golden", default=False)
enable_profiling = request.config.getoption("--enable-profiling", default=False)
enable_dump_tensor = request.config.getoption("--dump-tensor", default=False)

callable_obj = self.build_callable(st_platform)
sub_ids = getattr(type(self), "_st_sub_ids", {})
Expand All @@ -661,6 +691,7 @@ def test_run(self, st_platform, st_worker, request):
rounds=rounds,
skip_golden=skip_golden,
enable_profiling=enable_profiling,
enable_dump_tensor=enable_dump_tensor,
)
ran_any = True

Expand All @@ -686,6 +717,7 @@ def run_module(module_name):
parser.add_argument("-n", "--rounds", type=int, default=1, help="Run each case N times (default: 1)")
parser.add_argument("--skip-golden", action="store_true", help="Skip golden comparison (benchmark mode)")
parser.add_argument("--enable-profiling", action="store_true", help="Enable profiling (first round only)")
parser.add_argument("--dump-tensor", action="store_true", help="Dump per-task tensor I/O at runtime")
parser.add_argument("--build", action="store_true", help="Compile runtime from source")
parser.add_argument(
"--log-level",
Expand Down Expand Up @@ -734,6 +766,7 @@ def run_module(module_name):
rounds=args.rounds,
skip_golden=args.skip_golden,
enable_profiling=args.enable_profiling,
enable_dump_tensor=args.dump_tensor,
)
print("PASSED")
except Exception as e:
Expand Down
Loading
Loading