diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bfc8f4758..879e40394 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -401,4 +401,6 @@ jobs: - name: Run pytest scene tests (a5) run: | - source ${ASCEND_HOME_PATH}/bin/setenv.bash && python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v + source ${ASCEND_HOME_PATH}/bin/setenv.bash + DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))") + task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v" diff --git a/conftest.py b/conftest.py index 071a8b655..736dda4e4 100644 --- a/conftest.py +++ b/conftest.py @@ -69,6 +69,7 @@ def pytest_addoption(parser): parser.addoption( "--enable-profiling", action="store_true", default=False, help="Enable profiling (first round only)" ) + parser.addoption("--dump-tensor", action="store_true", default=False, help="Dump per-task tensor I/O at runtime") parser.addoption("--build", action="store_true", default=False, help="Compile runtime from source") diff --git a/docs/tensor-dump.md b/docs/tensor-dump.md new file mode 100644 index 000000000..b8bc60a53 --- /dev/null +++ b/docs/tensor-dump.md @@ -0,0 +1,603 @@ +# Tensor Dump — Runtime Tensor Capture + +Tensor Dump captures per-task tensor inputs and outputs during kernel +execution and exports them to disk for offline inspection. It is a +runtime observability feature: host pre-allocates buffers on device, +AICPU writes records during execution, host collects data and exports +JSON manifest + binary payload. + +Supported on both architectures (`a2a3` / `a5`) and all three runtimes +(`host_build_graph`, `aicpu_build_graph`, `tensormap_and_ringbuffer`). +Opt-in via `--dump-tensor` — zero overhead when disabled. 
+ +The **primary design** (a2a3) uses shared memory (`halHostRegister`) + +background threads for concurrent collection during execution. The a5 +implementation uses a **temporary memcpy-based fallback** (batch +collect-after-sync) because a5 hardware does not yet support +host-pinned shared memory mapping. Device-side data structures and +AICPU recording logic are **identical** across both platforms — only the +host-side collection transport differs. + +--- + +## 1. What gets captured + +For every task that AICPU dispatches: + +- **`TensorDumpRole`** — per formal callable signature (`IN` / `OUT` / + `INOUT`). +- **`TensorDumpStage`** — `BEFORE_DISPATCH` (inputs snapshotted before + the kernel runs) and `AFTER_COMPLETION` (outputs snapshotted after + the kernel reports FIN). `INOUT` tensors are captured at both stages. +- **Metadata** — `task_id`, `subtask_id` (AIC / AIV0 / AIV1), `func_id`, + `arg_index`, `dtype`, `shapes`, `raw_shapes`, `offsets`, + `is_contiguous`. +- **Payload bytes** — copied from the tensor's device buffer into a + per-thread circular arena. Non-contiguous views are gathered via + logical traversal; contiguous views take a fast-path memcpy. + +Each record is a fixed 128 B (two cache lines) — see `TensorDumpRecord` +in [`tensor_dump.h`](../src/a2a3/platform/include/common/tensor_dump.h). + +--- + +## 2. 
Architecture + +### 2.1 Common device-side structures + +Both platforms share the same device-side layout, published via +`kernel_args.dump_data_base`: + +```text +DumpSetupHeader (host init, AICPU reads) +├── num_dump_threads +├── records_per_buffer +├── magic = 0x44554D50 ("DUMP") +├── dump_buffer_ptrs [MAX_AICPU_THREADS] ──> DumpBuffer (per-thread) +├── arena_header_ptrs [MAX_AICPU_THREADS] ──> DumpArenaHeader +├── arena_data_ptrs [MAX_AICPU_THREADS] ──> arena bytes +└── arena_sizes [MAX_AICPU_THREADS] + +DumpBuffer (per-thread, 64 B header + records[]) + ├── count (AICPU writes) + ├── capacity (host sets) + ├── dropped_count (AICPU increments when full) + └── TensorDumpRecord records[capacity] ← 128 B each + +DumpArenaHeader (per-thread) + ├── write_offset (AICPU monotonic cursor) + └── arena_size (host sets) + +arena_data (per-thread, circular byte buffer) + default = BUFFERS_PER_THREAD × RECORDS_PER_BUFFER × AVG_TENSOR_BYTES + = 8 × 256 × 64 KiB = 128 MiB per thread +``` + +These structs are binary-identical between a2a3 and a5 (enforced by +`static_assert`). `dump_data_base` flows through `KernelArgs`, not +`Runtime` — AICPU reads it from `k_args->dump_data_base` in +`kernel.cpp` and passes it to `set_platform_dump_base()`. + +### 2.2 a2a3 — shared-memory + background thread (primary design) + +This is the canonical architecture. `halHostRegister` maps device memory +into host virtual address space so the host can read device buffers +directly without `rtMemcpy`. A `DumpMemoryManager` background thread +polls SPSC ready queues and recycles full metadata buffers **while +kernels are still executing**. 
+ +```text + HOST DEVICE +┌──────────────────────────┐ ┌──────────────────────────┐ +│ TensorDumpCollector │ │ AICPU thread │ +│ │ │ │ +│ initialize() │ alloc + │ dump_tensor_init() │ +│ rtMalloc + halRegister │──register────>│ read DumpSetupHeader │ +│ build DumpDataHeader │ │ cache per-thread ptrs │ +│ │ │ │ +│ start_memory_manager() │ │ per-task run loop: │ +│ ┌────────────────────┐ │ │ BEFORE_DISPATCH │ +│ │ DumpMemoryManager │ │ │ dump_tensor_record() │ +│ │ background thread │ │ SPSC ready │ → write to arena │ +│ │ poll ready queue │<┼──queues──────<│ → append record │ +│ │ recycle buffers │─┼──free queue──>│ → push to ready_q │ +│ └────────────────────┘ │ │ dispatch kernel │ +│ │ │ wait FIN │ +│ poll_and_collect() │ │ AFTER_COMPLETION │ +│ concurrent thread │ shared mem │ dump_tensor_record() │ +│ reads arena via host │<──mapping────<│ │ +│ mapping (no memcpy) │ │ │ +│ │ │ dump_tensor_flush() │ +│ signal_execution_complete│ │ log per-thread stats │ +│ stop_memory_manager() │ └──────────────────────────┘ +│ drain_remaining_buffers()│ +│ scan_remaining_buffers() │ +│ │ +│ export_dump_files() │ +│ → outputs/tensor_dump_ │ +│ YYYYMMDD_HHMMSS/ │ +│ manifest.json │ +│ tensors.bin │ +└──────────────────────────┘ +``` + +**Execution flow** (`device_runner.cpp`): + +```text +init_tensor_dump() + dump_collector_.initialize(...) 
+ kernel_args_.args.dump_data_base = dump_collector_.get_dump_shm_device_ptr() +start_memory_manager() ← spawn background polling thread +launch AICPU / AICore +spawn collector_thread ← poll_and_collect() concurrent with execution +rtStreamSynchronize ← wait for kernel completion +signal_execution_complete() ← tell background thread to drain +stop_memory_manager() +drain_remaining_buffers() ← pick up any stragglers +scan_remaining_dump_buffers() ← scan partial records still on device +export_dump_files() +``` + +Key classes (a2a3): + +- [`DumpMemoryManager`](../src/a2a3/platform/include/host/tensor_dump_collector.h) — + background thread: polls device ready queues, hands full buffers to + main thread, recycles them back to device free queue. +- [`TensorDumpCollector`](../src/a2a3/platform/include/host/tensor_dump_collector.h) — + main thread: `initialize` / `start_memory_manager` / + `poll_and_collect` / `signal_execution_complete` / + `drain_remaining_buffers` / `export_dump_files` / `finalize`. + +### 2.3 a5 — memcpy batch (temporary fallback) + +a5 hardware does not yet support `halHostRegister`. All device-to-host +transfers use `rtMemcpy` / `memcpy`, which requires the device to have +stopped writing. Collection happens **only after** +`rtStreamSynchronize`. No background threads, no SPSC queues. + +This is a temporary simplification; the a5 implementation should migrate +to the a2a3 shared-memory design once `halHostRegister` becomes +available on a5 hardware. 
+ +```text + HOST DEVICE +┌──────────────────────────┐ ┌──────────────────────────┐ +│ TensorDumpCollector │ │ AICPU thread │ +│ │ │ │ +│ initialize() │ alloc + │ dump_tensor_init() │ +│ rtMalloc / malloc │──copy────────>│ read DumpSetupHeader │ +│ build DumpSetupHeader │ │ cache per-thread ptrs │ +│ copy to device │ │ │ +│ │ │ per-task run loop: │ +│ (no background thread) │ │ BEFORE_DISPATCH │ +│ │ │ dump_tensor_record() │ +│ ── kernel execution ── │ │ dispatch kernel │ +│ │ │ wait FIN │ +│ rtStreamSynchronize │ │ AFTER_COMPLETION │ +│ │ │ dump_tensor_record() │ +│ collect_all() │ batch │ │ +│ 2-step per thread: │<──memcpy─────<│ dump_tensor_flush() │ +│ 1. copy DumpBuffer hdr │ │ log per-thread stats │ +│ read count │ └──────────────────────────┘ +│ 2. copy records+arena │ +│ │ +│ export_dump_files() │ +│ → outputs/tensor_dump_ │ +│ YYYYMMDD_HHMMSS/ │ +│ manifest.json │ +│ tensors.bin │ +└──────────────────────────┘ +``` + +**Execution flow** (`device_runner.cpp`): + +```text +init_tensor_dump() + dump_collector_.initialize(...) + kernel_args_.dump_data_base = dump_collector_.get_dump_setup_device_ptr() +launch AICPU / AICore ← no background thread +rtStreamSynchronize ← wait for kernel completion +collect_all() ← batch memcpy all buffers back +export_dump_files() +``` + +Key class (a5): + +- [`TensorDumpCollector`](../src/a5/platform/include/host/tensor_dump_collector.h) — + `initialize` / `collect_all` / `export_dump_files` / `finalize`. + No `DumpMemoryManager`, no `start_memory_manager`. 
+ +### 2.4 Common: where dump calls are wired in + +Each runtime's `aicpu_executor.cpp` calls `dump_tensors_for_task` at +two points in the per-task state machine: + +```text +┌──────────────────────────────────────┐ +│ per-task dispatch: │ +│ if enable_dump_tensor { │ +│ dump_tensors_for_task( │ +│ BEFORE_DISPATCH); │ +│ } │ +│ dispatch(task); │ +│ wait FIN; │ +│ if enable_dump_tensor { │ +│ dump_tensors_for_task( │ +│ AFTER_COMPLETION); │ +│ } │ +│ retire(task); │ +└──────────────────────────────────────┘ +``` + +`dump_tensors_for_task` walks the formal callable signature, matches +each non-scalar slot to a `TensorDumpInfo` (dtype + shape + offsets + +device address), and calls `dump_tensor_record` for slots that match +the current stage (inputs `BEFORE`, outputs `AFTER`, inouts both). + +### 2.5 Common: tensor metadata registration + +AICPU only has device addresses and sizes — it does **not** know the +logical shape / dtype / view geometry of each tensor unless the runtime +registers it. Each of the three runtimes exposes metadata through a +slightly different path, but they all converge on `TensorInfo` (see +[`tensor_info.h`](../src/a5/runtime/host_build_graph/runtime/tensor_info.h)): + +- **`host_build_graph`** — two orchestration-side APIs: + - `add_task()` → `set_tensor_info_to_task(task_id, info[], count)` + - `add_task_with_tensor_info()` (single call convenience wrapper) + + See + [`dump_tensor_orch.cpp`](../tests/st/a5/host_build_graph/dump_tensor_example/kernels/orchestration/dump_tensor_orch.cpp) + for both styles in one file. +- **`aicpu_build_graph`** — runtime layer fills `TensorInfo` from + `PTO2TaskPayload::tensors[]` directly. No orchestration API needed. +- **`tensormap_and_ringbuffer`** — identical to `aicpu_build_graph`; + the ring buffer carries `PTO2TaskPayload` which already contains + shape/offset arrays. 
+ +When metadata is missing or inconsistent, the task is **skipped for +dump** and a single `LOG_WARN` is emitted (guarded by +`try_log_tensor_dump_layout_mismatch` to avoid log flooding). Normal +execution is never affected. + +--- + +## 3. Usage + +### 3.1 Enable at runtime + +The feature is gated end-to-end by a single boolean +(`ChipCallConfig::enable_dump_tensor`) that threads from the Python +harness through `ChipWorker::run` into `pto_runtime_c_api` and finally +into `DeviceRunner::run`. When `false`, zero bytes are allocated and +no dump code paths execute. + +**From a scene test** (`SceneTestCase.run_module` / pytest): + +```bash +# Standalone runner +python tests/st/a5/host_build_graph/dump_tensor_example/test_dump_tensor_example.py \ + -p a5sim --dump-tensor + +# pytest +pytest tests/st/a5/host_build_graph/dump_tensor_example -p a5sim --dump-tensor +``` + +**From `run_example.py`** (any example): + +```bash +python examples/scripts/run_example.py \ + -k examples/a5/host_build_graph/vector_example/kernels \ + -g examples/a5/host_build_graph/vector_example/golden.py \ + -p a5sim --dump-tensor +``` + +### 3.2 Output layout + +```text +outputs/ +└── tensor_dump_/ + ├── manifest.json # Array of TensorDumpRecord metadata + └── tensors.bin # Raw packed payload, indexed by bin_offset +``` + +`manifest.json`: + +```json +{ + "bin_file": "tensors.bin", + "tensors": [ + { + "task_id": "0x0000000200000a00", + "subtask_id": 1, + "role": "input", + "stage": "before_dispatch", + "func_id": 0, + "arg_index": 0, + "dtype": "float32", + "ndims": 1, + "shape": [16384], + "raw_shape": [16384], + "offsets": [0], + "is_contiguous": true, + "truncated": false, + "overwritten": false, + "bin_offset": 0, + "bin_size": 65536 + } + ] +} +``` + +### 3.3 Inspect with `tools/dump_viewer.py` + +The viewer auto-picks the latest `outputs/tensor_dump_*` directory +when invoked without arguments: + +```bash +# List every dumped tensor in the latest run +python tools/dump_viewer.py + +# 
Filter and save matching tensors to human-readable .txt files +python tools/dump_viewer.py --func 0 --stage before --role input --export + +# Export one specific entry by its manifest index +python tools/dump_viewer.py --index 42 + +# Pin to a specific dump directory +python tools/dump_viewer.py outputs/tensor_dump_20260414_092413 \ + --task 0x0000000200000a00 --export +``` + +Exported `.txt` files include metadata headers, a row-major overview +with aligned columns, and a detail listing with multi-dim indices — +safe to diff against golden tensors or pipe into a spreadsheet. + +### 3.4 Add dump support to a new test + +Only `host_build_graph` needs explicit wiring; the other two runtimes +pick up metadata automatically. + +```cpp +// In orchestration C++ (host_build_graph only) +TensorInfo info_a = make_tensor_info_from_tensor_arg(orch_args.tensor(0)); +TensorInfo info_b = make_tensor_info_from_tensor_arg(orch_args.tensor(1)); +TensorInfo info_f = make_tensor_info_from_tensor_arg(orch_args.tensor(2)); + +int t0 = add_task(runtime, args_t0, 4, /*func_id=*/0, CoreType::AIV); +TensorInfo t0_info[] = {info_a, info_b, info_f}; +set_tensor_info_to_task(runtime, t0, t0_info, 3); + +// Or in one call +int t1 = add_task_with_tensor_info( + runtime, args_t1, /*num_args=*/3, /*func_id=*/1, CoreType::AIV, + t1_info, /*tensor_count=*/1); +``` + +See the full template: +[`tests/st/a5/host_build_graph/dump_tensor_example`](../tests/st/a5/host_build_graph/dump_tensor_example/) +(and the `a2a3` mirror at `tests/st/a2a3/host_build_graph/dump_tensor_example`). + +--- + +## 4. 
Configuration knobs + +All defaults live in +[`platform_config.h`](../src/a2a3/platform/include/common/platform_config.h) +and match between `a2a3` and `a5`: + +| Constant | Default | Effect | +| -------- | ------- | ------ | +| `PLATFORM_DUMP_RECORDS_PER_BUFFER` | 256 | Max records per DumpBuffer (a2a3: per metadata buffer) | +| `PLATFORM_DUMP_BUFFERS_PER_THREAD` | 8 | Arena size multiplier (a2a3: also SPSC free queue depth) | +| `PLATFORM_DUMP_AVG_TENSOR_BYTES` | 64 KiB | Arena size multiplier | +| `PLATFORM_DUMP_MAX_DIMS` | 5 | Upper bound on shape / offset arrays | +| `PLATFORM_MAX_AICPU_THREADS` | 7 | Number of dump-producing threads | + +Per-thread arena = +`BUFFERS_PER_THREAD × RECORDS_PER_BUFFER × AVG_TENSOR_BYTES` += `8 × 256 × 65536` = **128 MiB**. + +--- + +## 5. Memory-pressure behaviour + +Three distinct failure modes exist when dump buffers run out of space. +All three are **safe** — they never crash the kernel or corrupt +execution — and all three surface in the JSON manifest plus the +`dump_tensor_flush` log line so users can detect and diagnose them. + +### 5.1 Truncation (`truncated = true`) + +**Trigger:** a single tensor's logical payload (`numel × elem_size`) +exceeds the entire per-thread arena size. + +**Mechanism (identical on a2a3 and a5):** before copying, AICPU +compares `bytes` against `arena_size`. When `bytes > arena_size`, +only `arena_size / 2` bytes are copied and the record is flagged +`truncated = 1`. + +```text +bytes = numel × elem_size +if bytes > arena_size: + copy_bytes = arena_size / 2 ← half the arena + truncated = true +``` + +**Effect:** the tensor entry in the manifest has `"truncated": true` +and `bin_size` is smaller than the full tensor. The payload contains +the first `arena_size / 2` bytes of the **logical** layout (gathered +or contiguous), enough for statistical sampling. 
+ +**Tuning:** increase `PLATFORM_DUMP_AVG_TENSOR_BYTES` (arena grows +proportionally) so that the arena is at least as large as the biggest +tensor you need to inspect. + +### 5.2 Overwrite (`overwritten = true`) + +**Trigger:** the circular arena wraps around and AICPU writes new +payload data over a region whose metadata record has already been +emitted but whose payload has not yet been consumed by the host. + +**a2a3 mechanism:** the arena is a monotonic-offset circular buffer. +`arena_write_offset` grows without bound; the actual write position +is `offset % arena_size`. When the host processes a record it +compares the record's `payload_offset` against a high-water mark: + +```text +high_water = max payload_offset seen so far (maintained per-thread) +if high_water > arena_size: + oldest_valid = high_water − arena_size + if record.payload_offset < oldest_valid: + overwritten = true +``` + +Because a2a3 uses shared memory and a background reader, the host can +drain arena data **while the kernel is still running**. Overwrite +happens only when AICPU writes faster than the host can read — i.e. +many large tensors arrive in rapid succession without the host keeping +up. + +**a5 mechanism:** same arithmetic, but detection happens in +`collect_all()` after `rtStreamSynchronize`: + +```text +write_offset = arena_header.write_offset (total bytes ever written) +if write_offset > arena_size: + oldest_valid = write_offset − arena_size + if record.payload_offset < oldest_valid: + overwritten = true +``` + +Because a5 collects only after the stream finishes, the entire +execution window's data must fit in the arena. If total payload bytes +written across all tasks exceed `arena_size`, the earliest payloads +are overwritten. + +**Effect:** overwritten records appear in the manifest with +`"overwritten": true` and zero payload bytes in the binary file. +Metadata (shape, dtype, task_id) is preserved — only the raw data +is lost. 
+ +**Tuning:** increase `PLATFORM_DUMP_BUFFERS_PER_THREAD` (arena grows +proportionally) so total payload fits, or reduce the number of tasks +being dumped. + +### 5.3 Record discard (`dropped_count` / `dropped_records`) + +**Trigger:** the metadata record buffer (not the payload arena) is +full and no replacement buffer is available. + +**a5 mechanism (simple):** each thread has a single `DumpBuffer` with +`capacity = RECORDS_PER_BUFFER` (default 256). When `count >= +capacity`, subsequent `dump_tensor_record()` calls increment +`dropped_count` and return immediately — **no metadata, no payload** +is stored for that tensor. + +```text +if buf.count >= buf.capacity: + buf.dropped_count++ + return ← tensor silently skipped +``` + +**a2a3 mechanism (rotating buffers):** each thread rotates through +multiple `DumpMetaBuffer`s via an SPSC free queue. When a buffer fills +(256 records), AICPU tries to: + +1. **Enqueue** the full buffer to the per-thread ready queue (for the + host background thread to pick up). +2. **Pop** a fresh buffer from the free queue. + +If the ready queue is full or the free queue is empty, AICPU +spin-waits up to `DUMP_SPIN_WAIT_LIMIT` (1 000 000 iterations) to +give the host `DumpMemoryManager` time to replenish. If the wait +expires: + +```text +// Overwrite current buffer — account for lost records +account_dropped_records(state, cur_buf.count) +cur_buf.count = 0 ← reset and reuse +dropped_record_count += N ← tracks total lost records +``` + +The same fallback applies during `dump_tensor_flush()` at end of +execution if the ready queue is full. + +**Effect:** `dropped_records` in the manifest summary shows how many +tensor records were lost. Individual dropped tensors do **not** appear +in the `tensors[]` array at all — they are gone without trace. + +**Tuning:** increase `PLATFORM_DUMP_BUFFERS_PER_THREAD` (more +rotation buffers) and/or `PLATFORM_DUMP_READYQUEUE_SIZE` (deeper host +handoff queue). 
+ +### 5.4 Summary matrix + +| Condition | Flag | Metadata | Payload | a2a3 | a5 | +| --------- | ---- | -------- | ------- | ---- | -- | +| Tensor > arena | `truncated` | Preserved | Partial (`arena/2` bytes) | Same | Same | +| Arena wraps, old data overwritten | `overwritten` | Preserved | Lost (zero bytes in bin) | Rare (concurrent drain) | Likely if total data > arena | +| Record buffer full, no free buffer | `dropped_count` | Lost | Lost | After spin-wait fallback | Immediate when count ≥ capacity | + +--- + +## 6. Known issues + +### 6.1 After-completion dumps capture stale output-tensor data (mixed_example) + +**Status:** open — requires pto-isa clarification. + +**Symptom:** in `mixed_example` (a2a3 / tensormap_and_ringbuffer), +some `after_completion` output-tensor dumps show stale or zero +prefixes followed by correct suffixes. The kernel execution result +itself is correct; only the dumped snapshot is wrong. + +**Root cause:** the TSTORE instruction enqueues an asynchronous MTE3 +copy from vector pipe to Global Memory. The current post-TSTORE +synchronisation in affected kernels uses: + +```cpp +set_flag(PIPE_V, PIPE_MTE3); +wait_flag(PIPE_V, PIPE_MTE3); +``` + +This pair does **not** guarantee that the MTE3 outgoing queue has +fully drained to GM before AICore signals FIN to AICPU. When AICPU +receives FIN and immediately reads the output buffer for the +`after_completion` dump, it can observe partially-written data. + +**Evidence:** replacing the `set_flag/wait_flag` pair with +`pipe_barrier(PIPE_ALL)` makes every `after_completion` dump match +its recomputed golden values. + +**Affected scope:** any kernel that uses TSTORE and relies on +`set_flag(PIPE_V, PIPE_MTE3)` + `wait_flag(PIPE_V, PIPE_MTE3)` as +the final barrier before FIN. The dump itself is not at fault — the +issue is in the ordering semantics between the MTE3 queue and the +AICore → AICPU FIN handshake. 
+ +**Workaround:** use `pipe_barrier(PIPE_ALL)` instead of the +`set_flag/wait_flag` pair in kernels whose outputs need correct +`after_completion` dumps. + +**Open question for pto-isa:** + +- Does `set_flag/wait_flag(PIPE_V, PIPE_MTE3)` guarantee that MTE3 + writes are visible in GM, or only that the MTE3 **pipe** is idle? +- What is the correct barrier to ensure GM visibility before FIN? + +**References:** + +- Kernels: `examples/a2a3/tensormap_and_ringbuffer/mixed_example/kernels/aic/` and `aiv/` +- Runtime: `src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp` + +--- + +## 7. Related docs + +- [chip-level-arch.md](chip-level-arch.md) — host / AICPU / AICore + program boundaries this feature spans. +- [task-flow.md](task-flow.md) — where AICPU dispatch and completion + sit in the per-task state machine. +- [distributed_level_runtime.md](distributed_level_runtime.md) — how + L2 (this feature) relates to L3+ composition. diff --git a/docs/testing.md b/docs/testing.md index ee5f8bdac..16ac9860f 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -51,6 +51,10 @@ python examples/a2a3/tensormap_and_ringbuffer/vector_example/test_vector_example python examples/a2a3/tensormap_and_ringbuffer/vector_example/test_vector_example.py \ -p a2a3 --enable-profiling +# Tensor dump +python tests/st/a2a3/tensormap_and_ringbuffer/alternating_matmul_add/test_alternating_matmul_add.py \ + -p a2a3 -d 11 --dump-tensor + # Single example via run_example.py (deprecated — prefer test_*.py standalone) python examples/scripts/run_example.py \ -k examples/a2a3/host_build_graph/vector_example/kernels \ @@ -95,6 +99,7 @@ pytest --platform a2a3sim --log-level debug # verbose C++ python test_xxx.py -p a2a3sim # default: 1 round + golden python test_xxx.py -p a2a3 -d 0 -n 100 --skip-golden # benchmark mode python test_xxx.py -p a2a3 --enable-profiling # profiling (first round) +python test_xxx.py -p a2a3 --dump-tensor # dump per-task tensor I/O python test_xxx.py -p a2a3sim 
--build # compile runtime from source python test_xxx.py -p a2a3sim --log-level debug # verbose C++ logging ``` @@ -106,6 +111,7 @@ python test_xxx.py -p a2a3sim --log-level debug # verbose C++ l | `--rounds N` | `-n` | 1 | Run each case N times | | `--skip-golden` | | false | Skip golden comparison (for benchmarking) | | `--enable-profiling` | | false | Enable profiling on first round only | +| `--dump-tensor` | | false | Dump per-task tensor I/O during runtime execution | | `--build` | | false | Compile runtime from source (not pre-built) | | `--log-level LEVEL` | | (none) | Set `PTO_LOG_LEVEL` env var (`error`/`warn`/`info`/`debug`) | diff --git a/examples/scripts/run_example.py b/examples/scripts/run_example.py index 83075a232..839b5600f 100644 --- a/examples/scripts/run_example.py +++ b/examples/scripts/run_example.py @@ -143,6 +143,12 @@ def compute_golden(tensors: dict, params: dict) -> None: help="Enable profiling and generate swimlane.json", ) + parser.add_argument( + "--dump-tensor", + action="store_true", + help="Dump per-task tensor I/O at runtime (controlled by enable_dump_tensor flag)", + ) + parser.add_argument( "--all", action="store_true", @@ -223,6 +229,7 @@ def compute_golden(tensors: dict, params: dict) -> None: device_id=args.device, platform=args.platform, enable_profiling=args.enable_profiling, + enable_dump_tensor=args.dump_tensor, run_all_cases=args.all, case_name=args.case, pto_isa_commit=args.pto_isa_commit, diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 2ae5dbb30..066977890 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -541,10 +541,12 @@ NB_MODULE(_task_interface, m) { .def_rw("block_dim", &ChipCallConfig::block_dim) .def_rw("aicpu_thread_num", &ChipCallConfig::aicpu_thread_num) .def_rw("enable_profiling", &ChipCallConfig::enable_profiling) + .def_rw("enable_dump_tensor", &ChipCallConfig::enable_dump_tensor) .def("__repr__", [](const ChipCallConfig 
&self) -> std::string { std::ostringstream os; os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num - << ", enable_profiling=" << (self.enable_profiling ? "True" : "False") << ")"; + << ", enable_profiling=" << (self.enable_profiling ? "True" : "False") + << ", enable_dump_tensor=" << (self.enable_dump_tensor ? "True" : "False") << ")"; return os.str(); }); diff --git a/simpler_setup/code_runner.py b/simpler_setup/code_runner.py index 9a8e2858f..a70f6096c 100644 --- a/simpler_setup/code_runner.py +++ b/simpler_setup/code_runner.py @@ -193,6 +193,7 @@ def __init__( # noqa: PLR0913 device_id: Optional[int] = None, platform: str = "a2a3", enable_profiling: bool = False, + enable_dump_tensor: bool = False, run_all_cases: bool = False, case_name: Optional[str] = None, pto_isa_commit: Optional[str] = None, @@ -212,6 +213,7 @@ def __init__( # noqa: PLR0913 self.golden_path = Path(golden_path).resolve() self.platform = platform self.enable_profiling = enable_profiling + self.enable_dump_tensor = enable_dump_tensor self.skip_golden = skip_golden self.project_root = PROJECT_ROOT @@ -608,6 +610,9 @@ def _compile_one_kernel(kernel): if self.enable_profiling and round_idx == 0: config.enable_profiling = True logger.info("Profiling enabled") + if self.enable_dump_tensor: + config.enable_dump_tensor = True + logger.info("Dump tensor enabled") with _temporary_env(run_env): worker.run(chip_callable, orch_args, config) @@ -682,6 +687,7 @@ def create_code_runner( # noqa: PLR0913 device_id=None, platform="a2a3", enable_profiling=False, + enable_dump_tensor=False, run_all_cases=False, case_name=None, pto_isa_commit=None, @@ -698,6 +704,7 @@ def create_code_runner( # noqa: PLR0913 device_id=device_id, platform=platform, enable_profiling=enable_profiling, + enable_dump_tensor=enable_dump_tensor, run_all_cases=run_all_cases, case_name=case_name, pto_isa_commit=pto_isa_commit, diff --git a/simpler_setup/scene_test.py 
b/simpler_setup/scene_test.py index 8133b264a..4e1aae4bb 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -499,13 +499,14 @@ def build_callable(self, platform): return self._compile_l3_callables(platform) raise ValueError(f"Unsupported level: {self._st_level}") - def _build_config(self, config_dict, enable_profiling=False): + def _build_config(self, config_dict, enable_profiling=False, enable_dump_tensor=False): from simpler.task_interface import ChipCallConfig # noqa: PLC0415 config = ChipCallConfig() config.block_dim = config_dict.get("block_dim", 1) config.aicpu_thread_num = config_dict.get("aicpu_thread_num", 3) config.enable_profiling = enable_profiling + config.enable_dump_tensor = enable_dump_tensor return config def _resolve_env(self): @@ -526,7 +527,15 @@ def _resolve_env(self): # ------------------------------------------------------------------ def _run_and_validate( - self, worker, callable_obj, case, sub_ids=None, rounds=1, skip_golden=False, enable_profiling=False + self, + worker, + callable_obj, + case, + sub_ids=None, + rounds=1, + skip_golden=False, + enable_profiling=False, + enable_dump_tensor=False, ): if self._st_level == 2: self._run_and_validate_l2( @@ -536,6 +545,7 @@ def _run_and_validate( rounds=rounds, skip_golden=skip_golden, enable_profiling=enable_profiling, + enable_dump_tensor=enable_dump_tensor, ) elif self._st_level == 3: self._run_and_validate_l3( @@ -546,9 +556,12 @@ def _run_and_validate( rounds=rounds, skip_golden=skip_golden, enable_profiling=enable_profiling, + enable_dump_tensor=enable_dump_tensor, ) - def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden=False, enable_profiling=False): + def _run_and_validate_l2( + self, worker, callable_obj, case, rounds=1, skip_golden=False, enable_profiling=False, enable_dump_tensor=False + ): params = case.get("params", {}) config_dict = case.get("config", {}) orch_sig = self.CALLABLE.get("orchestration", {}).get("signature", []) 
@@ -575,7 +588,11 @@ def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden for name, initial in initial_outputs.items(): getattr(test_args, name).copy_(initial) - config = self._build_config(config_dict, enable_profiling=(enable_profiling and round_idx == 0)) + config = self._build_config( + config_dict, + enable_profiling=(enable_profiling and round_idx == 0), + enable_dump_tensor=enable_dump_tensor, + ) with _temporary_env(self._resolve_env()): worker.run(callable_obj, chip_args, config=config) @@ -584,7 +601,15 @@ def _run_and_validate_l2(self, worker, callable_obj, case, rounds=1, skip_golden _compare_outputs(test_args, golden_args, output_names, self.RTOL, self.ATOL) def _run_and_validate_l3( - self, worker, compiled_callables, sub_ids, case, rounds=1, skip_golden=False, enable_profiling=False + self, + worker, + compiled_callables, + sub_ids, + case, + rounds=1, + skip_golden=False, + enable_profiling=False, + enable_dump_tensor=False, ): from simpler.worker import Task # noqa: PLC0415 @@ -619,7 +644,11 @@ def _run_and_validate_l3( for name, initial in initial_tensors.items(): getattr(test_args, name).copy_(initial) - config = self._build_config(config_dict, enable_profiling=(enable_profiling and round_idx == 0)) + config = self._build_config( + config_dict, + enable_profiling=(enable_profiling and round_idx == 0), + enable_dump_tensor=enable_dump_tensor, + ) # Wrap in Task — user orch signature: (orch, callables, task_args, config) def task_orch(orch, _unused, _ns=ns, _test_args=test_args, _config=config): @@ -642,6 +671,7 @@ def test_run(self, st_platform, st_worker, request): rounds = request.config.getoption("--rounds", default=1) skip_golden = request.config.getoption("--skip-golden", default=False) enable_profiling = request.config.getoption("--enable-profiling", default=False) + enable_dump_tensor = request.config.getoption("--dump-tensor", default=False) callable_obj = self.build_callable(st_platform) sub_ids = 
getattr(type(self), "_st_sub_ids", {}) @@ -661,6 +691,7 @@ def test_run(self, st_platform, st_worker, request): rounds=rounds, skip_golden=skip_golden, enable_profiling=enable_profiling, + enable_dump_tensor=enable_dump_tensor, ) ran_any = True @@ -686,6 +717,7 @@ def run_module(module_name): parser.add_argument("-n", "--rounds", type=int, default=1, help="Run each case N times (default: 1)") parser.add_argument("--skip-golden", action="store_true", help="Skip golden comparison (benchmark mode)") parser.add_argument("--enable-profiling", action="store_true", help="Enable profiling (first round only)") + parser.add_argument("--dump-tensor", action="store_true", help="Dump per-task tensor I/O at runtime") parser.add_argument("--build", action="store_true", help="Compile runtime from source") parser.add_argument( "--log-level", @@ -734,6 +766,7 @@ def run_module(module_name): rounds=args.rounds, skip_golden=args.skip_golden, enable_profiling=args.enable_profiling, + enable_dump_tensor=args.dump_tensor, ) print("PASSED") except Exception as e: diff --git a/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h new file mode 100644 index 000000000..50db7d59c --- /dev/null +++ b/src/a2a3/platform/include/aicpu/tensor_dump_aicpu.h @@ -0,0 +1,274 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * @file tensor_dump_aicpu.h
+ * @brief AICPU tensor dump collection interface
+ *
+ * Provides tensor dump management for AICPU side.
+ * Handles dump shared-memory base propagation plus buffer initialization,
+ * tensor data copying to arenas, metadata recording, and flushing.
+ */
+
+#ifndef PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
+#define PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
+
+#include <cinttypes>
+
+#include "common/memory_barrier.h"
+#include "common/tensor_dump.h"
+#include "data_type.h"
+
+#ifdef __cplusplus
+#include "callable.h"
+#include "common/unified_log.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Set the tensor dump shared-memory base address.
+ * Called by the platform layer before AICPU execution starts.
+ *
+ * @param dump_data_base Device pointer (as uint64_t) to dump shared memory
+ */
+void set_platform_dump_base(uint64_t dump_data_base);
+
+/**
+ * Get the tensor dump shared-memory base address.
+ *
+ * @return Device pointer (as uint64_t) to dump shared memory
+ */
+uint64_t get_platform_dump_base();
+
+/**
+ * Set whether tensor dump is enabled for this execution.
+ * Called by the platform layer before AICPU execution starts.
+ *
+ * @param enable true to enable tensor dump, false to disable
+ */
+void set_enable_dump_tensor(bool enable);
+
+/**
+ * Get whether tensor dump is enabled for this execution.
+ *
+ * @return true if tensor dump is enabled
+ */
+bool get_enable_dump_tensor();
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role);
+int32_t count_callable_tensor_args(const CoreCallable &callable);
+bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage);
+bool try_log_tensor_dump_layout_mismatch();
+int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
+
+template <typename SlotStateT, typename IsSubtaskActiveFn, typename GetFunctionBinAddrFn>
+inline void dump_tensors_for_task(
+    int32_t thread_idx, const SlotStateT &slot_state, TensorDumpStage stage, IsSubtaskActiveFn is_subtask_active,
+    GetFunctionBinAddrFn get_function_bin_addr
+) {
+    const auto &pl = *slot_state.payload;
+    const CoreCallable *callables[MaxSubtaskSlots] = {};
+    int32_t total_tensor_args = 0;
+
+    for (int raw_subtask_id = 0; raw_subtask_id < MaxSubtaskSlots; raw_subtask_id++) {
+        if (!is_subtask_active(slot_state.active_mask, raw_subtask_id)) {
+            continue;
+        }
+        int32_t slot_idx = raw_subtask_id;
+        uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
+        if (callable_addr == 0) {
+            return;
+        }
+        callables[slot_idx] = reinterpret_cast<const CoreCallable *>(callable_addr);
+        total_tensor_args += count_callable_tensor_args(*callables[slot_idx]);
+    }
+
+    if (total_tensor_args != pl.tensor_count) {
+        if (try_log_tensor_dump_layout_mismatch()) {
+            LOG_WARN(
+                "Thread %d: tensor dump skipped for task 0x%" PRIx64
+                ": active callable tensor count (%d) does not match payload tensor count (%d). "
+                "Task-level dump assumes payload tensors are concatenated by active subtask order.",
+                thread_idx, static_cast<uint64_t>(slot_state.task->task_id.raw), total_tensor_args, pl.tensor_count
+            );
+        }
+        return;
+    }
+
+    rmb();
+
+    int32_t payload_index = 0;
+    for (int raw_subtask_id = 0; raw_subtask_id < MaxSubtaskSlots; raw_subtask_id++) {
+        if (!is_subtask_active(slot_state.active_mask, raw_subtask_id)) {
+            continue;
+        }
+        int32_t slot_idx = raw_subtask_id;
+        const CoreCallable &callable = *callables[slot_idx];
+        for (int32_t sig_idx = 0; sig_idx < callable.sig_count(); sig_idx++) {
+            ArgDirection dir = callable.sig(sig_idx);
+            if (dir == ArgDirection::SCALAR) {
+                continue;
+            }
+            TensorDumpRole role;
+            if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage)) {
+                const auto &t = pl.tensors[payload_index];
+                TensorDumpInfo info = {};
+                info.buffer_addr = t.buffer.addr;
+                info.dtype = static_cast<uint8_t>(t.dtype);
+                info.ndims = static_cast<uint8_t>(t.ndims);
+                const uint32_t *raw_shapes = t.get_raw_shapes();
+                for (uint32_t d = 0; d < t.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) {
+                    info.shapes[d] = t.shapes[d];
+                    info.offsets[d] = t.is_all_offset_zero ? 0 : t.offsets[d];
+                    info.raw_shapes[d] = raw_shapes[d];
+                }
+                info.task_id = slot_state.task->task_id.raw;
+                info.subtask_id = raw_subtask_id;
+                info.func_id = slot_state.task->kernel_id[slot_idx];
+                info.arg_index = static_cast<uint32_t>(payload_index);
+                info.role = role;
+                info.stage = stage;
+                dump_tensor_record(thread_idx, info);
+            }
+            payload_index++;
+        }
+    }
+}
+
+template <typename TensorInfoT>
+inline void dump_tensors_for_task(
+    int32_t thread_idx, uint64_t task_id, uint8_t subtask_id, int32_t task_arg_count, int32_t func_id,
+    const CoreCallable &callable, const TensorInfoT *tensor_info, int32_t tensor_info_count,
+    const uint64_t *buffer_addrs, int32_t buffer_count, TensorDumpStage stage
+) {
+    int32_t sig_count = callable.sig_count();
+    if (task_arg_count < sig_count) {
+        static bool logged_task_signature_mismatch = false;
+        if (!logged_task_signature_mismatch) {
+            logged_task_signature_mismatch = true;
+            LOG_WARN(
+                "Thread %d: tensor dump skipped for task 0x%" PRIx64
+                ": task args (%d) smaller than callable signature (%d)",
+                thread_idx, task_id, task_arg_count, sig_count
+            );
+        }
+        return;
+    }
+
+    int32_t tensor_arg_count = count_callable_tensor_args(callable);
+    if (tensor_info == nullptr || tensor_info_count != tensor_arg_count) {
+        if (tensor_arg_count == 0) {
+            return;
+        }
+        if (try_log_tensor_dump_layout_mismatch()) {
+            LOG_WARN(
+                "Thread %d: tensor dump skipped for task 0x%" PRIx64
+                ": callable tensor args (%d) do not match registered tensor info (%d)",
+                thread_idx, task_id, tensor_arg_count, tensor_info_count
+            );
+        }
+        return;
+    }
+
+    if (buffer_addrs == nullptr || buffer_count != tensor_arg_count) {
+        static bool logged_task_tensor_addr_mismatch = false;
+        if (!logged_task_tensor_addr_mismatch) {
+            logged_task_tensor_addr_mismatch = true;
+            LOG_WARN(
+                "Thread %d: tensor dump skipped for task 0x%" PRIx64
+                ": reconstructed tensor buffers (%d) do not match callable tensor args (%d)",
+                thread_idx, task_id, buffer_count, tensor_arg_count
+            );
+        }
+        return;
+    }
+
+    rmb();
+
+    int32_t tensor_arg_index = 0;
+    for (int32_t sig_idx = 0; sig_idx < sig_count; sig_idx++) {
+        ArgDirection dir = callable.sig(sig_idx);
+        if (dir == ArgDirection::SCALAR) {
+            continue;
+        }
+
+        TensorDumpRole role;
+        if (!get_tensor_dump_role_from_direction(dir, &role) || !should_dump_tensor_at_stage(role, stage)) {
+            tensor_arg_index++;
+            continue;
+        }
+
+        const auto &t = tensor_info[tensor_arg_index];
+        TensorDumpInfo info = {};
+        info.task_id = task_id;
+        info.subtask_id = subtask_id;
+        info.role = role;
+        info.stage = stage;
+        info.dtype = static_cast<uint8_t>(t.dtype);
+        info.ndims = t.ndims;
+        info.func_id = static_cast<uint32_t>(func_id);
+        info.arg_index = static_cast<uint32_t>(tensor_arg_index);
+        info.buffer_addr = buffer_addrs[tensor_arg_index];
+        for (uint32_t d = 0; d < t.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) {
+            info.shapes[d] = t.shapes[d];
+            info.offsets[d] = t.offsets[d];
+            info.raw_shapes[d] = t.raw_shapes[d];
+        }
+        dump_tensor_record(thread_idx, info);
+        tensor_arg_index++;
+    }
+}
+#endif
+
+/**
+ * Initialize tensor dump.
+ *
+ * Sets up per-thread DumpBufferState pointers and pops initial
+ * metadata buffers from each thread's free_queue.
+ *
+ * @param num_dump_threads Number of scheduling threads that will dump tensors
+ */
+void dump_tensor_init(int num_dump_threads);
+
+/**
+ * Record a single tensor dump.
+ *
+ * Copies tensor data from GM to the thread's arena, appends a
+ * TensorDumpRecord to the current metadata buffer. Switches
+ * buffers when full via the SPSC free_queue.
+ *
+ * When metadata buffers are temporarily exhausted, old dump metadata may be
+ * overwritten so execution can continue without losing the active buffer.
+ *
+ * @param thread_idx Scheduling thread index
+ * @param info Tensor metadata and identification
+ * @return 0 on success or intentional drop, -1 only when dump state is unavailable
+ */
+int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
+
+/**
+ * Flush remaining tensor dump data for a thread.
+ *
+ * Marks non-empty metadata buffers as ready and enqueues them
+ * for host collection.
+ *
+ * @param thread_idx Thread index
+ */
+void dump_tensor_flush(int thread_idx);
+
+#endif  // PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
diff --git a/src/a2a3/platform/include/common/kernel_args.h b/src/a2a3/platform/include/common/kernel_args.h
index 593cda4c3..e1a70cab0 100644
--- a/src/a2a3/platform/include/common/kernel_args.h
+++ b/src/a2a3/platform/include/common/kernel_args.h
@@ -59,6 +59,8 @@ extern "C" {
  * - device_args: Written by host, read by AICPU (contains aicpu_so_bin/aicpu_so_len)
  * - runtime_args: Written by host, read by AICPU (task runtime, includes
  *   handshake buffers)
+ * - dump_data_base: Written by host, read by AICPU platform layer; zero when
+ *   tensor dump is unused
  *
  * Field Access Patterns:
  * - AICPU: receives KernelArgs* via DynTileFwkBackendKernelServer
@@ -70,6 +72,7 @@ struct KernelArgs {
     __may_used_by_aicore__ Runtime *runtime_args{nullptr}; // Task runtime in device memory
     uint64_t regs{0};           // Per-core register base address array (platform-specific)
     uint64_t ffts_base_addr{0}; // FFTS base address for AICore
+    uint64_t dump_data_base{0}; // Dump shared memory base address, zero when unused
 };
 
 #ifdef __cplusplus
diff --git a/src/a2a3/platform/include/common/platform_config.h b/src/a2a3/platform/include/common/platform_config.h
index d1ac3be8d..fad14e851 100644
--- a/src/a2a3/platform/include/common/platform_config.h
+++ b/src/a2a3/platform/include/common/platform_config.h
@@ -142,6 +142,52 @@ inline double cycles_to_us(uint64_t cycles) {
     return (static_cast<double>(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0;
 }
 
+// =============================================================================
+// Tensor Dump Configuration
+// =============================================================================
+
+/**
+ * Number of TensorDumpRecord entries per DumpMetaBuffer.
+ * Each record is 128 bytes, so one buffer = RECORDS * 128 bytes.
+ */
+constexpr int PLATFORM_DUMP_RECORDS_PER_BUFFER = 256;
+
+/**
+ * Pre-allocated DumpMetaBuffer count per AICPU scheduling thread.
+ * Pushed into the per-thread SPSC free_queue at init.
+ */
+constexpr int PLATFORM_DUMP_BUFFERS_PER_THREAD = 8;
+
+/**
+ * SPSC free_queue slot count for dump metadata buffers.
+ */
+constexpr int PLATFORM_DUMP_SLOT_COUNT = 4;
+
+/**
+ * Expected average tensor size in bytes.
+ * Used together with BUFFERS_PER_THREAD and RECORDS_PER_BUFFER to compute
+ * per-thread arena size:
+ *   arena = BUFFERS_PER_THREAD * RECORDS_PER_BUFFER * AVG_TENSOR_BYTES
+ * Default: 8 * 256 * 65536 = 128 MB per thread.
+ */
+constexpr uint64_t PLATFORM_DUMP_AVG_TENSOR_BYTES = 65536;
+
+/**
+ * Maximum tensor dimensions (matches RUNTIME_MAX_TENSOR_DIMS).
+ */
+constexpr int PLATFORM_DUMP_MAX_DIMS = 5;
+
+/**
+ * Ready queue capacity for dump data.
+ * Sized to hold all dump buffers across all threads.
+ */
+constexpr int PLATFORM_DUMP_READYQUEUE_SIZE = PLATFORM_MAX_AICPU_THREADS * PLATFORM_DUMP_BUFFERS_PER_THREAD * 2;
+
+/**
+ * Idle timeout duration for tensor dump collection (seconds)
+ */
+constexpr int PLATFORM_DUMP_TIMEOUT_SECONDS = 30;
+
 // =============================================================================
 // Register Communication Configuration
 // =============================================================================
diff --git a/src/a2a3/platform/include/common/tensor_dump.h b/src/a2a3/platform/include/common/tensor_dump.h
new file mode 100644
index 000000000..5a44069bc
--- /dev/null
+++ b/src/a2a3/platform/include/common/tensor_dump.h
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file tensor_dump.h + * @brief Tensor dump data structures for device-to-host tensor collection + * + * Independent shared memory region for capturing per-task tensor I/O. + * Fully decoupled from profiling — uses its own ready queues, buffer states, + * and memory manager thread. + * + * Memory layout (Dump SHM, allocated only when PTO2_DUMP_TENSOR=1): + * ┌─────────────────────────────────────────────────────────────┐ + * │ DumpDataHeader (fixed header) │ + * │ - Per-thread ready queues (circular FIFOs) │ + * │ - Metadata (num_dump_threads, config) │ + * ├─────────────────────────────────────────────────────────────┤ + * │ DumpBufferState[0] (Thread 0) │ + * │ - free_queue: SPSC queue of DumpMetaBuffer addresses │ + * │ - current_buf_ptr, arena_base, arena_write_offset │ + * ├─────────────────────────────────────────────────────────────┤ + * │ DumpBufferState[1] (Thread 1) │ + * ├─────────────────────────────────────────────────────────────┤ + * │ ... │ + * └─────────────────────────────────────────────────────────────┘ + * + * Per-thread payload arenas are separate allocations (registered once via + * halHostRegister). DumpMetaBuffers are allocated by the host and pushed + * into per-thread free_queues. 
+ */
+
+#ifndef SRC_A2A3_PLATFORM_INCLUDE_COMMON_TENSOR_DUMP_H_
+#define SRC_A2A3_PLATFORM_INCLUDE_COMMON_TENSOR_DUMP_H_
+
+#include <cstddef>
+#include <cstdint>
+
+#include "common/platform_config.h"
+
+// =============================================================================
+// Constants
+// =============================================================================
+
+constexpr uint32_t TENSOR_DUMP_MAGIC = 0x44554D50; // "DUMP"
+
+// =============================================================================
+// TensorDumpRole - Formal kernel signature direction
+// =============================================================================
+
+enum class TensorDumpRole : uint8_t {
+    INPUT = 0,
+    OUTPUT = 1,
+    INOUT = 2,
+};
+
+// =============================================================================
+// TensorDumpStage - When the tensor was captured
+// =============================================================================
+
+enum class TensorDumpStage : uint8_t {
+    BEFORE_DISPATCH = 0,
+    AFTER_COMPLETION = 1,
+};
+
+// =============================================================================
+// TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines)
+// =============================================================================
+
+/**
+ * Per-tensor metadata + payload reference.
+ * + * Cache line 1 (64B): identifiers, payload location, compact scalar metadata + * Cache line 2 (64B): logical/source layout arrays + */ +struct alignas(64) TensorDumpRecord { + // === Cache line 1 (64B) === + uint64_t task_id; // PTO2 encoding or plain task index + uint8_t subtask_id; // PTO2SubtaskSlot raw value (AIC=0, AIV0=1, AIV1=2) + uint8_t role; // TensorDumpRole (formal callable signature) + uint8_t stage; // TensorDumpStage (before/after execution) + uint8_t ndims; // Number of dimensions + uint32_t func_id; // Kernel function identifier + uint32_t arg_index; // Position in PTO2TaskPayload::tensors[] + uint8_t dtype; // DataType raw enum value + uint8_t truncated; // 1 if payload was truncated (tensor > arena capacity) + uint8_t is_contiguous; // 1 when source view is already contiguous + uint8_t pad0_align; // Explicit alignment before 64-bit payload offsets + uint64_t payload_offset; // Monotonic byte offset into thread arena + uint64_t payload_size; // Bytes actually copied (may be < full tensor bytes) + uint8_t pad0[24]; // Preserve 64B cache-line layout + + // === Cache line 2 (64B) === + uint32_t shapes[PLATFORM_DUMP_MAX_DIMS]; // Current view shape + uint32_t offsets[PLATFORM_DUMP_MAX_DIMS]; // Multi-dimensional offsets + uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS]; // Underlying source layout shape + uint8_t pad1[4]; // Pad to 128 bytes +} __attribute__((aligned(64))); + +static_assert(sizeof(TensorDumpRecord) == 128, "TensorDumpRecord must be 128 bytes (2 cache lines)"); + +// ============================================================================= +// DumpMetaBuffer - Fixed-Size Record Buffer +// ============================================================================= + +/** + * Fixed-size dump record buffer. + * Capacity: PLATFORM_DUMP_RECORDS_PER_BUFFER + * Allocated by host, pushed into per-thread free_queue. 
+ */ +struct DumpMetaBuffer { + TensorDumpRecord records[PLATFORM_DUMP_RECORDS_PER_BUFFER]; + volatile uint32_t count; // Current record count +} __attribute__((aligned(64))); + +// ============================================================================= +// DumpFreeQueue - SPSC Lock-Free Queue for Free Buffers +// ============================================================================= + +/** + * Single Producer Single Consumer (SPSC) lock-free queue. + * Same layout and semantics as PerfFreeQueue, separate type for decoupling. + * + * Producer: Host (DumpMemoryManager thread) pushes recycled/new buffers + * Consumer: Device (AICPU thread) pops buffers when switching + */ +struct DumpFreeQueue { + volatile uint64_t buffer_ptrs[PLATFORM_DUMP_SLOT_COUNT]; + volatile uint32_t head; // Consumer read position (Device increments) + volatile uint32_t tail; // Producer write position (Host increments) + uint32_t pad[13]; // Pad to 128 bytes +} __attribute__((aligned(64))); + +static_assert(sizeof(DumpFreeQueue) == 128, "DumpFreeQueue must be 128 bytes"); + +// ============================================================================= +// DumpBufferState - Per-Thread Buffer State +// ============================================================================= + +/** + * Per-thread buffer management state. 
+ * + * Writers: + * - free_queue.tail: Host writes (pushes new buffers) + * - free_queue.head: Device writes (pops buffers) + * - current_buf_ptr: Device writes (after pop), Host reads (for flush/collect) + * - current_buf_seq: Device writes (monotonic counter) + * - arena_write_offset: Device writes (monotonic), Host reads (for overwrite detection) + * - dropped_record_count: Device writes (records lost before host export) + */ +struct DumpBufferState { + DumpFreeQueue free_queue; // SPSC queue of free DumpMetaBuffer addresses + volatile uint64_t current_buf_ptr; // Current active DumpMetaBuffer (0 = none) + volatile uint32_t current_buf_seq; // Sequence number for ordering + uint32_t pad0; // Alignment + volatile uint64_t arena_base; // Device pointer to this thread's arena + volatile uint64_t arena_size; // Arena size in bytes + volatile uint64_t arena_write_offset; // Monotonic write cursor (host computes % arena_size) + volatile uint32_t dropped_record_count; // Records dropped before host export + uint8_t pad1[28]; // Pad to 256 bytes +} __attribute__((aligned(64))); + +static_assert(sizeof(DumpBufferState) == 256, "DumpBufferState must be 256 bytes"); + +// ============================================================================= +// DumpReadyQueueEntry - Ready Queue Entry +// ============================================================================= + +/** + * When a DumpMetaBuffer is full, AICPU adds this entry to the thread's ready queue. + * Host memory manager retrieves entries and processes them. 
+ */ +struct DumpReadyQueueEntry { + uint32_t thread_index; // Thread index (0 ~ num_dump_threads-1) + uint32_t pad0; + uint64_t buffer_ptr; // Device pointer to the full DumpMetaBuffer + uint32_t buffer_seq; // Sequence number for ordering + uint32_t pad1; +} __attribute__((aligned(32))); + +// ============================================================================= +// DumpDataHeader - Fixed Header +// ============================================================================= + +/** + * Dump data fixed header, located at the start of dump shared memory. + * + * Contains: + * 1. Per-thread ready queues (circular FIFOs) — one per AICPU thread + * 2. Metadata (thread count, config) + * + * Ready queue design mirrors PerfDataHeader but is independent: + * - Per-thread queues avoid lock contention + * - Producer: AICPU thread (adds full DumpMetaBuffers) + * - Consumer: Host DumpMemoryManager thread + * - Queue empty: head == tail + * - Queue full: (tail + 1) % capacity == head + */ +struct DumpDataHeader { + // Per-thread ready queues + DumpReadyQueueEntry queues[PLATFORM_MAX_AICPU_THREADS][PLATFORM_DUMP_READYQUEUE_SIZE]; + volatile uint32_t queue_heads[PLATFORM_MAX_AICPU_THREADS]; // Host reads (consumer) + volatile uint32_t queue_tails[PLATFORM_MAX_AICPU_THREADS]; // AICPU writes (producer) + + // Metadata (Host initializes, Device reads) + uint32_t num_dump_threads; + uint32_t records_per_buffer; + uint64_t arena_size_per_thread; + uint32_t magic; + uint32_t pad; +} __attribute__((aligned(64))); + +// ============================================================================= +// TensorDumpInfo - Lightweight Info Struct (passed from runtime to platform API) +// ============================================================================= + +/** + * Caller fills this struct from runtime-specific tensor types. + * Platform layer is agnostic to runtime-specific types (Tensor, PTO2TaskPayload, etc.). 
+ */
+struct TensorDumpInfo {
+    uint64_t task_id;
+    uint8_t subtask_id;
+    TensorDumpRole role;
+    TensorDumpStage stage;
+    uint8_t dtype;
+    uint8_t ndims;
+    uint32_t func_id;
+    uint32_t arg_index;
+    uint64_t buffer_addr;
+    uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];
+    uint32_t offsets[PLATFORM_DUMP_MAX_DIMS];
+    uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS];
+};
+
+// =============================================================================
+// Helper Functions - Memory Layout
+// =============================================================================
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Calculate total memory size for dump header + buffer states.
+ *
+ * @param num_dump_threads Number of AICPU scheduling threads
+ * @return Total bytes for DumpDataHeader + DumpBufferState array
+ */
+inline size_t calc_dump_data_size(int num_dump_threads) {
+    return sizeof(DumpDataHeader) + num_dump_threads * sizeof(DumpBufferState);
+}
+
+/**
+ * Calculate per-thread arena size from configuration constants.
+ *
+ * @return Arena size in bytes per thread
+ */
+inline uint64_t calc_dump_arena_size() {
+    return static_cast<uint64_t>(PLATFORM_DUMP_BUFFERS_PER_THREAD) * PLATFORM_DUMP_RECORDS_PER_BUFFER *
+           PLATFORM_DUMP_AVG_TENSOR_BYTES;
+}
+
+/**
+ * Get DumpDataHeader pointer.
+ *
+ * @param base_ptr Dump shared memory base address
+ * @return DumpDataHeader pointer
+ */
+inline DumpDataHeader *get_dump_header(void *base_ptr) { return reinterpret_cast<DumpDataHeader *>(base_ptr); }
+
+/**
+ * Get DumpBufferState array start address (after DumpDataHeader).
+ *
+ * @param base_ptr Dump shared memory base address
+ * @return DumpBufferState array pointer
+ */
+inline DumpBufferState *get_dump_buffer_states(void *base_ptr) {
+    return reinterpret_cast<DumpBufferState *>(reinterpret_cast<uint8_t *>(base_ptr) + sizeof(DumpDataHeader));
+}
+
+/**
+ * Get DumpBufferState for specified thread.
+ * + * @param base_ptr Dump shared memory base address + * @param thread_idx Thread index (0 ~ num_dump_threads-1) + * @return DumpBufferState pointer + */ +inline DumpBufferState *get_dump_buffer_state(void *base_ptr, int thread_idx) { + return &get_dump_buffer_states(base_ptr)[thread_idx]; +} + +#ifdef __cplusplus +} +#endif + +#endif // SRC_A2A3_PLATFORM_INCLUDE_COMMON_TENSOR_DUMP_H_ diff --git a/src/a2a3/platform/include/host/tensor_dump_collector.h b/src/a2a3/platform/include/host/tensor_dump_collector.h new file mode 100644 index 000000000..ee0a85b33 --- /dev/null +++ b/src/a2a3/platform/include/host/tensor_dump_collector.h @@ -0,0 +1,292 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file tensor_dump_collector.h + * @brief Host-side tensor dump collector with independent shared memory + * + * Fully decoupled from profiling: uses its own shared memory region, + * ready queues, and memory manager thread. + * + * Mirrors PerformanceCollector architecture: + * - DumpMemoryManager: Background thread that polls dump ready queues, + * recycles metadata buffers, and hands off full buffers to the main thread. + * - TensorDumpCollector: Main thread copies tensor data from arenas, + * manages lifecycle, and exports dump files. 
+ */
+
+#ifndef SRC_A2A3_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_
+#define SRC_A2A3_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_
+
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <cstdint>
+#include <mutex>
+#include <queue>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "common/platform_config.h"
+#include "common/tensor_dump.h"
+#include "data_type.h"
+
+/**
+ * Memory allocation callback for tensor dump buffers and shared memory.
+ *
+ * @param size Memory size in bytes
+ * @param user_data Opaque allocator context
+ * @return Allocated device memory pointer, or nullptr on failure
+ */
+using DumpAllocCallback = void *(*)(size_t size, void *user_data);
+
+/**
+ * Memory registration callback for host-visible shared memory mappings.
+ *
+ * @param dev_ptr Device memory pointer
+ * @param size Memory size in bytes
+ * @param device_id Device ID
+ * @param user_data Opaque allocator context
+ * @param[out] host_ptr Host-mapped pointer
+ * @return 0 on success, error code on failure
+ */
+using DumpRegisterCallback = int (*)(void *dev_ptr, size_t size, int device_id, void *user_data, void **host_ptr);
+
+/**
+ * Memory unregister callback.
+ *
+ * @param dev_ptr Device memory pointer
+ * @param device_id Device ID
+ * @param user_data Opaque allocator context
+ * @return 0 on success, error code on failure
+ */
+using DumpUnregisterCallback = int (*)(void *dev_ptr, int device_id, void *user_data);
+
+/**
+ * Memory free callback.
+ *
+ * @param dev_ptr Device memory pointer
+ * @param user_data Opaque allocator context
+ * @return 0 on success, error code on failure
+ */
+using DumpFreeCallback = int (*)(void *dev_ptr, void *user_data);
+
+/**
+ * Callback for binding the memory-manager thread to a device context.
+ *
+ * @param device_id Device ID
+ * @param user_data Opaque allocator context
+ * @return 0 on success, error code on failure
+ */
+using DumpSetDeviceCallback = int (*)(int device_id, void *user_data);
+
+// =============================================================================
+// DumpMemoryManager - Background Thread
+// =============================================================================
+
+/**
+ * Information about a ready (full) dump metadata buffer
+ */
+struct DumpReadyBufferInfo {
+    uint32_t thread_index;
+    void *dev_buffer_ptr;
+    void *host_buffer_ptr;
+    uint32_t buffer_seq;
+};
+
+/**
+ * Dump buffer memory manager thread.
+ *
+ * Polls per-thread ready queues in DumpDataHeader, hands off full
+ * DumpMetaBuffers to the main thread, and recycles them back into
+ * the SPSC free_queue.
+ */
+class DumpMemoryManager {
+public:
+    DumpMemoryManager() = default;
+    ~DumpMemoryManager();
+
+    DumpMemoryManager(const DumpMemoryManager &) = delete;
+    DumpMemoryManager &operator=(const DumpMemoryManager &) = delete;
+
+    friend class TensorDumpCollector;
+
+    void start(
+        void *shared_mem_host, int num_dump_threads, DumpAllocCallback alloc_cb, DumpRegisterCallback register_cb,
+        DumpFreeCallback free_cb, void *user_data, int device_id, DumpSetDeviceCallback set_device_cb = nullptr
+    );
+
+    void stop();
+
+    bool try_pop_ready(DumpReadyBufferInfo &info);
+    bool wait_pop_ready(DumpReadyBufferInfo &info, std::chrono::milliseconds timeout);
+    void notify_copy_done(void *dev_buffer_ptr);
+
+    bool is_running() const { return running_.load(); }
+
+private:
+    std::thread mgmt_thread_;
+    std::atomic<bool> running_{false};
+
+    void *shared_mem_host_{nullptr};
+    int num_dump_threads_{0};
+
+    DumpAllocCallback alloc_cb_{nullptr};
+    DumpRegisterCallback register_cb_{nullptr};
+    DumpFreeCallback free_cb_{nullptr};
+    DumpSetDeviceCallback set_device_cb_{nullptr};
+    void *user_data_{nullptr};
+    int device_id_{-1};
+
+    std::mutex ready_mutex_;
+    std::condition_variable ready_cv_;
+    std::queue<DumpReadyBufferInfo> ready_queue_;
+
+    std::mutex done_mutex_;
+    std::queue<void *> done_queue_; // Device pointers to recycle
+
+    std::unordered_map<void *, void *> dev_to_host_;
+    std::vector<void *> recycled_dump_buffers_;
+
+    void mgmt_loop();
+    void *alloc_and_register(size_t size, void **host_ptr_out);
+    void free_buffer(void *dev_ptr);
+    void *resolve_host_ptr(void *dev_ptr);
+    void register_mapping(void *dev_ptr, void *host_ptr);
+    void process_dump_entry(DumpDataHeader *header, int thread_idx, const DumpReadyQueueEntry &entry);
+};
+
+// =============================================================================
+// TensorDumpCollector - Main Collector
+// =============================================================================
+
+/**
+ * Collected tensor metadata + payload bytes
+ */
+struct DumpedTensor {
+    uint64_t task_id;
+    uint8_t subtask_id;
+    uint32_t func_id;
+    uint32_t arg_index;
+    TensorDumpRole role;
+    TensorDumpStage stage;
+    uint8_t dtype;
+    uint8_t ndims;
+    uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS];
+    uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];
+    uint32_t offsets[PLATFORM_DUMP_MAX_DIMS];
+    bool is_contiguous;
+    bool truncated;
+    bool overwritten;
+    uint64_t payload_size; // original payload size (bytes may be cleared after writing)
+    uint64_t bin_offset;   // byte offset into tensors.bin
+    std::vector<uint8_t> bytes;
+};
+
+class TensorDumpCollector {
+public:
+    TensorDumpCollector() = default;
+    ~TensorDumpCollector();
+
+    TensorDumpCollector(const TensorDumpCollector &) = delete;
+    TensorDumpCollector &operator=(const TensorDumpCollector &) = delete;
+
+    /**
+     * Initialize tensor dump shared memory.
+     *
+     * Allocates DumpDataHeader + DumpBufferState array, per-thread arenas,
+     * and initial DumpMetaBuffers.
+ * + * @return 0 on success, error code on failure + */ + int initialize( + int num_dump_threads, int device_id, DumpAllocCallback alloc_cb, DumpRegisterCallback register_cb, + DumpFreeCallback free_cb, void *user_data, DumpSetDeviceCallback set_device_cb = nullptr + ); + + void start_memory_manager(); + void poll_and_collect(); + int export_dump_files(const std::string &output_path = "outputs"); + void stop_memory_manager(); + void drain_remaining_buffers(); + void scan_remaining_dump_buffers(); + void signal_execution_complete(); + + int finalize(DumpUnregisterCallback unregister_cb, DumpFreeCallback free_cb, void *user_data); + + bool is_initialized() const { return dump_shared_mem_host_ != nullptr; } + + void *get_dump_shm_device_ptr() const { return dump_shared_mem_dev_; } + +private: + void *dump_shared_mem_dev_{nullptr}; + void *dump_shared_mem_host_{nullptr}; + bool was_registered_{false}; + int device_id_{-1}; + int num_dump_threads_{0}; + + DumpAllocCallback alloc_cb_{nullptr}; + DumpRegisterCallback register_cb_{nullptr}; + DumpFreeCallback free_cb_{nullptr}; + DumpSetDeviceCallback set_device_cb_{nullptr}; + void *user_data_{nullptr}; + + // Per-thread arena pointers + struct ArenaInfo { + void *dev_ptr{nullptr}; + void *host_ptr{nullptr}; + uint64_t size{0}; + uint64_t high_water{0}; // For overwrite detection + }; + std::vector arenas_; + + DumpMemoryManager memory_manager_; + + // Collected dump tensors + std::vector collected_; + std::mutex collected_mutex_; + + // Execution complete signal + std::atomic execution_complete_{false}; + + // Stats + uint32_t total_dropped_record_count_{0}; + uint32_t total_truncated_count_{0}; + uint32_t total_overwrite_count_{0}; + + void *alloc_single_buffer(size_t size, void **host_ptr_out); + void process_dump_buffer(const DumpReadyBufferInfo &info); + + // Track processed buffer pointers to prevent double-processing + std::unordered_set processed_buffers_; + + // Writer thread: streams tensor payloads to a single 
tensors.bin + std::thread writer_thread_; + std::mutex write_mutex_; + std::condition_variable write_cv_; + std::queue write_queue_; + std::atomic writer_done_{false}; + + // Output directory and single binary file + std::filesystem::path run_dir_; + std::ofstream bin_file_; + uint64_t next_bin_offset_{0}; // only accessed by collect thread + + // Writer stats + std::atomic bytes_written_{0}; + + void writer_loop(); +}; + +#endif // SRC_A2A3_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_ diff --git a/src/a2a3/platform/onboard/aicpu/kernel.cpp b/src/a2a3/platform/onboard/aicpu/kernel.cpp index d82696201..02eff65ad 100644 --- a/src/a2a3/platform/onboard/aicpu/kernel.cpp +++ b/src/a2a3/platform/onboard/aicpu/kernel.cpp @@ -16,6 +16,7 @@ #include "aicpu/device_log.h" #include "aicpu/platform_regs.h" #include "aicpu/platform_aicpu_affinity.h" +#include "aicpu/tensor_dump_aicpu.h" #include "runtime.h" // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) @@ -80,6 +81,8 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer // Store platform regs before calling aicpu_execute set_platform_regs(k_args->regs); + set_platform_dump_base(k_args->dump_data_base); + set_enable_dump_tensor(k_args->dump_data_base != 0); // Affinity gate: drop excess threads before entering runtime if (!platform_aicpu_affinity_gate(runtime->sche_cpu_num, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH)) { diff --git a/src/a2a3/platform/onboard/host/CMakeLists.txt b/src/a2a3/platform/onboard/host/CMakeLists.txt index 12c86f4fd..2bef4d9ba 100644 --- a/src/a2a3/platform/onboard/host/CMakeLists.txt +++ b/src/a2a3/platform/onboard/host/CMakeLists.txt @@ -38,6 +38,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" + 
"${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" ) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 41d2235c8..b3def91cc 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -353,7 +353,7 @@ int DeviceRunner::copy_from_device(void *host_ptr, const void *dev_ptr, size_t b int DeviceRunner::run( Runtime &runtime, int block_dim, int device_id, const std::vector &aicpu_so_binary, - const std::vector &aicore_kernel_binary, int launch_aicpu_num + const std::vector &aicore_kernel_binary, int launch_aicpu_num, bool enable_dump_tensor ) { // Validate launch_aicpu_num if (launch_aicpu_num < 1 || launch_aicpu_num > PLATFORM_MAX_AICPU_THREADS) { @@ -473,6 +473,16 @@ int DeviceRunner::run( }); } + if (enable_dump_tensor) { + // Initialize tensor dump (independent from profiling) + rc = init_tensor_dump(runtime, num_aicore, device_id); + if (rc != 0) { + LOG_ERROR("init_tensor_dump failed: %d", rc); + return rc; + } + dump_collector_.start_memory_manager(); + } + auto perf_cleanup = RAIIScopeGuard([this]() { bool was_initialized = perf_collector_.is_initialized(); if (was_initialized) { @@ -540,6 +550,26 @@ int DeviceRunner::run( collector_thread.join(); } }); + auto collector_signal_guard = RAIIScopeGuard([this, &runtime]() { + if (runtime.enable_profiling) { + perf_collector_.signal_execution_complete(); + } + }); + + if (enable_dump_tensor) { + // Poll and collect dump data in a separate collector thread + std::thread dump_collector_thread([this]() { + dump_collector_.poll_and_collect(); + }); + auto dump_thread_guard = RAIIScopeGuard([&]() { + if (dump_collector_thread.joinable()) { + dump_collector_thread.join(); + } + }); + auto dump_signal_guard = RAIIScopeGuard([this]() { + dump_collector_.signal_execution_complete(); + }); + } std::cout << 
"\n=== rtStreamSynchronize stream_aicpu_===" << '\n'; // Synchronize streams @@ -555,11 +585,6 @@ int DeviceRunner::run( LOG_ERROR("rtStreamSynchronize (AICore) failed: %d", rc); return rc; } - - // Signal collector that device execution is complete - if (runtime.enable_profiling) { - perf_collector_.signal_execution_complete(); - } } // Stop memory management, drain remaining buffers, collect phase data, export @@ -571,6 +596,13 @@ int DeviceRunner::run( export_swimlane_json(); } + if (enable_dump_tensor) { + dump_collector_.stop_memory_manager(); + dump_collector_.drain_remaining_buffers(); + dump_collector_.scan_remaining_dump_buffers(); + dump_collector_.export_dump_files(); + } + // Print handshake results (reads from device memory, must be before free) print_handshake_results(); @@ -647,6 +679,24 @@ int DeviceRunner::finalize() { perf_collector_.finalize(unregister_cb, free_cb); } + if (dump_collector_.is_initialized()) { + auto unregister_cb = [](void *dev_ptr, int device_id, void *user_data) -> int { + (void)user_data; + HalHostUnregisterFn fn = get_halHostUnregister(); + if (fn != nullptr) { + return fn(dev_ptr, device_id); + } + return 0; + }; + + auto free_cb = [](void *dev_ptr, void *user_data) -> int { + auto *allocator = static_cast(user_data); + return allocator->free(dev_ptr); + }; + + dump_collector_.finalize(unregister_cb, free_cb, &mem_alloc_); + } + // Free all remaining allocations (including handshake buffer and binGmAddr) mem_alloc_.finalize(); @@ -832,3 +882,45 @@ void DeviceRunner::poll_and_collect_performance_data(int expected_tasks) { int DeviceRunner::export_swimlane_json(const std::string &output_path) { return perf_collector_.export_swimlane_json(output_path); } + +int DeviceRunner::init_tensor_dump(Runtime &runtime, int num_aicore, int device_id) { + int num_dump_threads = runtime.sche_cpu_num; + + auto alloc_cb = [](size_t size, void *user_data) -> void * { + auto *allocator = static_cast(user_data); + return allocator->alloc(size); 
+ }; + + auto register_cb = [](void *dev_ptr, size_t size, int device_id, void *user_data, void **host_ptr) -> int { + (void)user_data; + if (load_hal_if_needed() != 0) { + LOG_ERROR("Failed to load ascend_hal for tensor dump: %s", dlerror()); + return -1; + } + HalHostRegisterFn fn = get_halHostRegister(); + if (fn == nullptr) { + LOG_ERROR("halHostRegister symbol not found: %s", dlerror()); + return -1; + } + return fn(dev_ptr, size, DEV_SVM_MAP_HOST, device_id, host_ptr); + }; + + auto free_cb = [](void *dev_ptr, void *user_data) -> int { + auto *allocator = static_cast(user_data); + return allocator->free(dev_ptr); + }; + + auto set_device_cb = [](int device_id, void * /*user_data*/) -> int { + return rtSetDevice(device_id); + }; + + int rc = dump_collector_.initialize( + num_dump_threads, device_id, alloc_cb, register_cb, free_cb, &mem_alloc_, set_device_cb + ); + if (rc != 0) { + return rc; + } + + kernel_args_.args.dump_data_base = reinterpret_cast(dump_collector_.get_dump_shm_device_ptr()); + return 0; +} diff --git a/src/a2a3/platform/onboard/host/device_runner.h b/src/a2a3/platform/onboard/host/device_runner.h index 0c7598363..be81110bb 100644 --- a/src/a2a3/platform/onboard/host/device_runner.h +++ b/src/a2a3/platform/onboard/host/device_runner.h @@ -43,6 +43,7 @@ #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/performance_collector.h" +#include "host/tensor_dump_collector.h" #include "runtime.h" /** @@ -252,7 +253,7 @@ class DeviceRunner { */ int run(Runtime &runtime, int block_dim, int device_id, const std::vector &aicpu_so_binary, - const std::vector &aicore_kernel_binary, int launch_aicpu_num = 1); + const std::vector &aicore_kernel_binary, int launch_aicpu_num = 1, bool enable_dump_tensor = false); /** * Print handshake results from device @@ -397,6 +398,9 @@ class DeviceRunner { // Performance profiling PerformanceCollector perf_collector_; + // Tensor dump (independent shared memory + memory manager) + 
TensorDumpCollector dump_collector_; + /** * Ensure device is initialized (lazy initialization) * @@ -442,6 +446,19 @@ class DeviceRunner { * @return 0 on success, error code on failure */ int init_performance_profiling(Runtime &runtime, int num_aicore, int device_id); + + /** + * Initialize tensor dump shared memory and collector. + * + * Allocates dump SHM + per-thread arenas, populates initial meta buffers, + * and stores the dump base in AICPU launch arguments. + * + * @param runtime Runtime instance to configure + * @param num_aicore Number of AICore instances (unused; dump is per-thread) + * @param device_id Device ID for host registration + * @return 0 on success, error code on failure + */ + int init_tensor_dump(Runtime &runtime, int num_aicore, int device_id); }; #endif // RUNTIME_DEVICERUNNER_H diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index cc65fe997..d672355f6 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -122,7 +122,7 @@ int set_device(DeviceContextHandle ctx, int device_id) { int run_runtime( DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_profiling + size_t aicore_size, int enable_profiling, int enable_dump_tensor ) { if (ctx == NULL || runtime == NULL) return -1; if (aicpu_binary == NULL || aicpu_size == 0 || aicore_binary == NULL || aicore_size == 0) return -1; @@ -168,7 +168,7 @@ int run_runtime( std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); - rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, 
aicpu_thread_num, enable_dump_tensor != 0); if (rc != 0) { validate_runtime_impl(r); r->~Runtime(); diff --git a/src/a2a3/platform/sim/host/CMakeLists.txt b/src/a2a3/platform/sim/host/CMakeLists.txt index 8432536fd..b28b6b57a 100644 --- a/src/a2a3/platform/sim/host/CMakeLists.txt +++ b/src/a2a3/platform/sim/host/CMakeLists.txt @@ -43,6 +43,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../aicpu/platform_aicpu_affinity.cpp" ) diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index b4a0ffe8b..0979052b6 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -45,6 +45,8 @@ typedef void (*aicore_execute_func_t)( Runtime *runtime, int block_idx, CoreType core_type, uint32_t physical_core_id, uint64_t regs ); typedef void (*set_platform_regs_func_t)(uint64_t regs); +typedef void (*set_platform_dump_base_func_t)(uint64_t dump_data_base); +typedef void (*set_enable_dump_tensor_func_t)(bool enable); namespace { @@ -145,6 +147,18 @@ int DeviceRunner::ensure_binaries_loaded( LOG_ERROR("dlsym failed for set_platform_regs: %s", dlerror()); return -1; } + set_platform_dump_base_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_platform_dump_base")); + if (set_platform_dump_base_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_platform_dump_base: %s", dlerror()); + return -1; + } + set_enable_dump_tensor_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_enable_dump_tensor")); + if (set_enable_dump_tensor_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_enable_dump_tensor: %s", dlerror()); + return -1; + } LOG_INFO("DeviceRunner(sim): Loaded aicpu_execute from %s", 
aicpu_so_path_.c_str()); } @@ -210,7 +224,7 @@ int DeviceRunner::copy_from_device(void *host_ptr, const void *dev_ptr, size_t b int DeviceRunner::run( Runtime &runtime, int block_dim, int device_id, const std::vector &aicpu_so_binary, - const std::vector &aicore_kernel_binary, int launch_aicpu_num + const std::vector &aicore_kernel_binary, int launch_aicpu_num, bool enable_dump_tensor ) { clear_cpu_sim_shared_storage(); // Validate launch_aicpu_num @@ -312,6 +326,16 @@ int DeviceRunner::run( }); } + if (enable_dump_tensor) { + // Initialize tensor dump (independent from profiling) + rc = init_tensor_dump(runtime, num_aicore, device_id); + if (rc != 0) { + LOG_ERROR("init_tensor_dump failed: %d", rc); + return rc; + } + dump_collector_.start_memory_manager(); + } + auto perf_cleanup = RAIIScopeGuard([this]() { bool was_initialized = perf_collector_.is_initialized(); if (was_initialized) { @@ -361,6 +385,8 @@ int DeviceRunner::run( // Set platform regs in the AICPU .so before launching threads set_platform_regs_func_(kernel_args_.regs); + set_platform_dump_base_func_(kernel_args_.dump_data_base); + set_enable_dump_tensor_func_(enable_dump_tensor); // Launch AICPU threads (over-launch for affinity gate) constexpr int over_launch = PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH; @@ -400,23 +426,53 @@ int DeviceRunner::run( }); } - // Wait for all threads to complete - LOG_INFO("Waiting for threads to complete"); - for (auto &t : aicpu_threads) { - t.join(); - } - for (auto &t : aicore_threads) { - t.join(); - } + if (enable_dump_tensor) { + // Poll and collect dump data in a separate collector thread + std::thread dump_collector_thread([this]() { + dump_collector_.poll_and_collect(); + }); - // Signal collector that device execution is complete - if (runtime.enable_profiling) { - perf_collector_.signal_execution_complete(); - } + // Wait for all threads to complete + LOG_INFO("Waiting for threads to complete"); + for (auto &t : aicpu_threads) { + t.join(); + } + for (auto 
&t : aicore_threads) { + t.join(); + } + + // Signal collector that device execution is complete + if (runtime.enable_profiling) { + perf_collector_.signal_execution_complete(); + } + dump_collector_.signal_execution_complete(); + + // Wait for collector thread if it was launched + if (runtime.enable_profiling && collector_thread.joinable()) { + collector_thread.join(); + } + if (dump_collector_thread.joinable()) { + dump_collector_thread.join(); + } + } else { + // Wait for all threads to complete + LOG_INFO("Waiting for threads to complete"); + for (auto &t : aicpu_threads) { + t.join(); + } + for (auto &t : aicore_threads) { + t.join(); + } + + // Signal collector that device execution is complete + if (runtime.enable_profiling) { + perf_collector_.signal_execution_complete(); + } - // Wait for collector thread if it was launched - if (runtime.enable_profiling && collector_thread.joinable()) { - collector_thread.join(); + // Wait for collector thread if it was launched + if (runtime.enable_profiling && collector_thread.joinable()) { + collector_thread.join(); + } } LOG_INFO("All threads completed"); @@ -436,6 +492,13 @@ int DeviceRunner::run( export_swimlane_json(); } + if (enable_dump_tensor) { + dump_collector_.stop_memory_manager(); + dump_collector_.drain_remaining_buffers(); + dump_collector_.scan_remaining_dump_buffers(); + dump_collector_.export_dump_files(); + } + // Print handshake results at end of run print_handshake_results(); @@ -468,6 +531,8 @@ void DeviceRunner::unload_executor_binaries() { aicpu_so_handle_ = nullptr; aicpu_execute_func_ = nullptr; set_platform_regs_func_ = nullptr; + set_platform_dump_base_func_ = nullptr; + set_enable_dump_tensor_func_ = nullptr; } if (!aicpu_so_path_.empty()) { std::remove(aicpu_so_path_.c_str()); @@ -501,6 +566,16 @@ int DeviceRunner::finalize() { perf_collector_.finalize(nullptr, free_cb); } + if (dump_collector_.is_initialized()) { + auto free_cb = [](void *dev_ptr, void *user_data) -> int { + 
(void)user_data; + free(dev_ptr); + return 0; + }; + + dump_collector_.finalize(nullptr, free_cb, nullptr); + } + // Kernel binaries should have been removed by validate_runtime_impl() if (!func_id_to_addr_.empty()) { LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); @@ -657,3 +732,26 @@ void DeviceRunner::poll_and_collect_performance_data(int expected_tasks) { int DeviceRunner::export_swimlane_json(const std::string &output_path) { return perf_collector_.export_swimlane_json(output_path); } + +int DeviceRunner::init_tensor_dump(Runtime &runtime, int num_aicore, int device_id) { + (void)num_aicore; + int num_dump_threads = runtime.sche_cpu_num; + + auto alloc_cb = [](size_t size, void * /*user_data*/) -> void * { + return malloc(size); + }; + + auto free_cb = [](void *dev_ptr, void * /*user_data*/) -> int { + free(dev_ptr); + return 0; + }; + + // Simulation: no registration needed (dev == host) + int rc = dump_collector_.initialize(num_dump_threads, device_id, alloc_cb, nullptr, free_cb, nullptr, nullptr); + if (rc != 0) { + return rc; + } + + kernel_args_.dump_data_base = reinterpret_cast(dump_collector_.get_dump_shm_device_ptr()); + return 0; +} diff --git a/src/a2a3/platform/sim/host/device_runner.h b/src/a2a3/platform/sim/host/device_runner.h index dc2c87aa2..018809f65 100644 --- a/src/a2a3/platform/sim/host/device_runner.h +++ b/src/a2a3/platform/sim/host/device_runner.h @@ -50,6 +50,7 @@ #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/performance_collector.h" +#include "host/tensor_dump_collector.h" #include "runtime.h" /** @@ -142,7 +143,7 @@ class DeviceRunner { */ int run(Runtime &runtime, int block_dim, int device_id, const std::vector &aicpu_so_binary, - const std::vector &aicore_kernel_binary, int launch_aicpu_num = 1); + const std::vector &aicore_kernel_binary, int launch_aicpu_num = 1, bool enable_dump_tensor = false); /** * Print handshake results @@ -233,12 +234,17 @@ 
class DeviceRunner { int (*aicpu_execute_func_)(Runtime *){nullptr}; void (*aicore_execute_func_)(Runtime *, int, CoreType, uint32_t, uint64_t){nullptr}; void (*set_platform_regs_func_)(uint64_t){nullptr}; + void (*set_platform_dump_base_func_)(uint64_t){nullptr}; + void (*set_enable_dump_tensor_func_)(bool){nullptr}; std::string aicpu_so_path_; std::string aicore_so_path_; // Performance profiling PerformanceCollector perf_collector_; + // Tensor dump (independent shared memory + memory manager) + TensorDumpCollector dump_collector_; + // Private helper methods int ensure_device_initialized( int device_id, const std::vector &aicpu_so_binary, const std::vector &aicore_kernel_binary @@ -259,6 +265,8 @@ class DeviceRunner { * @return 0 on success, error code on failure */ int init_performance_profiling(Runtime &runtime, int num_aicore, int device_id); + + int init_tensor_dump(Runtime &runtime, int num_aicore, int device_id); }; #endif // SRC_A2A3_PLATFORM_SIM_HOST_DEVICE_RUNNER_H_ diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 28e382724..3e7dfd89e 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -124,7 +124,7 @@ int set_device(DeviceContextHandle ctx, int device_id) { int run_runtime( DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_profiling + size_t aicore_size, int enable_profiling, int enable_dump_tensor ) { if (ctx == NULL || runtime == NULL) return -1; @@ -167,7 +167,7 @@ int run_runtime( if (aicore_binary != NULL && aicore_size > 0) { aicore_vec.assign(aicore_binary, aicore_binary + aicore_size); } - rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + rc = runner->run(*r, block_dim, 
device_id, aicpu_vec, aicore_vec, aicpu_thread_num, enable_dump_tensor != 0); if (rc != 0) { validate_runtime_impl(r); r->~Runtime(); diff --git a/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp new file mode 100644 index 000000000..fcd7be6b5 --- /dev/null +++ b/src/a2a3/platform/src/aicpu/tensor_dump_aicpu.cpp @@ -0,0 +1,489 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file tensor_dump_aicpu.cpp + * @brief AICPU tensor dump collection implementation + * + * Mirrors performance_collector_aicpu.cpp patterns: + * - Per-thread DumpBufferState with SPSC free queues + * - Per-thread ready queue for handing off full metadata buffers + * - Per-thread circular arena for tensor payload data + */ + +#include "aicpu/tensor_dump_aicpu.h" + +#include + +#include "common/memory_barrier.h" +#include "common/platform_config.h" +#include "common/unified_log.h" + +// Cached pointers for hot-path access (set during init) +static uint64_t g_platform_dump_base = 0; +static DumpDataHeader *s_dump_header = nullptr; +static DumpBufferState *s_dump_states[PLATFORM_MAX_AICPU_THREADS] = {}; +static DumpMetaBuffer *s_current_dump_buf[PLATFORM_MAX_AICPU_THREADS] = {}; + +static bool s_logged_ready_queue_full[PLATFORM_MAX_AICPU_THREADS] = {}; +static bool s_logged_no_free_meta_buffer[PLATFORM_MAX_AICPU_THREADS] = {}; +static bool s_logged_dump_layout_mismatch = false; +static uint32_t s_records_written[PLATFORM_MAX_AICPU_THREADS] = {}; +static uint32_t s_buffers_switched[PLATFORM_MAX_AICPU_THREADS] = {}; +static uint32_t s_buffers_flushed[PLATFORM_MAX_AICPU_THREADS] = {}; + +static inline void account_dropped_records(DumpBufferState *state, uint32_t dropped_records) { + if (state == nullptr || dropped_records == 0) { + return; + } + uint32_t prev = state->dropped_record_count; + uint32_t next = prev + dropped_records; + state->dropped_record_count = (next < prev) ? 
UINT32_MAX : next; +} + +extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dump_base = dump_data_base; } + +extern "C" uint64_t get_platform_dump_base() { return g_platform_dump_base; } + +static bool g_enable_dump_tensor = false; + +extern "C" void set_enable_dump_tensor(bool enable) { g_enable_dump_tensor = enable; } + +extern "C" bool get_enable_dump_tensor() { return g_enable_dump_tensor; } + +bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role) { + switch (dir) { + case ArgDirection::IN: + *role = TensorDumpRole::INPUT; + return true; + case ArgDirection::OUT: + *role = TensorDumpRole::OUTPUT; + return true; + case ArgDirection::INOUT: + *role = TensorDumpRole::INOUT; + return true; + case ArgDirection::SCALAR: + return false; + } + return false; +} + +int32_t count_callable_tensor_args(const CoreCallable &callable) { + int32_t tensor_count = 0; + for (int32_t i = 0; i < callable.sig_count(); i++) { + if (callable.sig(i) != ArgDirection::SCALAR) { + tensor_count++; + } + } + return tensor_count; +} + +bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage) { + switch (role) { + case TensorDumpRole::INPUT: + return stage == TensorDumpStage::BEFORE_DISPATCH; + case TensorDumpRole::OUTPUT: + return stage == TensorDumpStage::AFTER_COMPLETION; + case TensorDumpRole::INOUT: + return true; + } + return false; +} + +bool try_log_tensor_dump_layout_mismatch() { + if (s_logged_dump_layout_mismatch) { + return false; + } + s_logged_dump_layout_mismatch = true; + return true; +} + +/** + * Enqueue a full dump metadata buffer to the thread's ready queue. 
+ */ +static int enqueue_dump_ready_buffer(int thread_idx, uint64_t buffer_ptr, uint32_t buffer_seq) { + uint32_t capacity = PLATFORM_DUMP_READYQUEUE_SIZE; + uint32_t current_tail = s_dump_header->queue_tails[thread_idx]; + uint32_t current_head = s_dump_header->queue_heads[thread_idx]; + + uint32_t next_tail = (current_tail + 1) % capacity; + if (next_tail == current_head) { + return -1; // Queue full + } + + s_dump_header->queues[thread_idx][current_tail].thread_index = static_cast(thread_idx); + s_dump_header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr; + s_dump_header->queues[thread_idx][current_tail].buffer_seq = buffer_seq; + wmb(); + s_dump_header->queue_tails[thread_idx] = next_tail; + wmb(); + + return 0; +} + +/** + * Maximum spin-wait iterations when free_queue or ready_queue is exhausted. + * Gives host mgmt_loop time to replenish before falling back to buffer overwrite. + */ +static constexpr uint32_t DUMP_SPIN_WAIT_LIMIT = 1000000; + +/** + * Switch metadata buffer: enqueue the full buffer, pop a new one. + * Spin-waits briefly for host to replenish before falling back to overwrite. 
+ */ +static int switch_dump_meta_buffer(int thread_idx) { + if (thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + return -1; + } + DumpBufferState *state = s_dump_states[thread_idx]; + DumpMetaBuffer *cur = s_current_dump_buf[thread_idx]; + if (state == nullptr || cur == nullptr) { + return -1; + } + + // Spin-wait for a free buffer, giving host mgmt_loop time to replenish + rmb(); + uint32_t head = state->free_queue.head; + uint32_t tail = state->free_queue.tail; + if (head == tail) { + for (uint32_t spin = 0; spin < DUMP_SPIN_WAIT_LIMIT; spin++) { + rmb(); + head = state->free_queue.head; + tail = state->free_queue.tail; + if (head != tail) { + break; + } + } + } + if (head == tail) { + // Still empty after spin — overwrite current buffer + account_dropped_records(state, cur->count); + cur->count = 0; + wmb(); + if (!s_logged_no_free_meta_buffer[thread_idx]) { + s_logged_no_free_meta_buffer[thread_idx] = true; + LOG_WARN( + "Tensor dump ran out of free metadata buffers on thread %d after spin-wait, " + "overwriting current buffer. Increase PLATFORM_DUMP_BUFFERS_PER_THREAD.", + thread_idx + ); + } + return 0; + } + + // Enqueue the full buffer (spin-wait if ready queue is full) + uint64_t buf_addr = reinterpret_cast(cur); + uint32_t seq = state->current_buf_seq; + int rc = enqueue_dump_ready_buffer(thread_idx, buf_addr, seq); + if (rc != 0) { + for (uint32_t spin = 0; spin < DUMP_SPIN_WAIT_LIMIT; spin++) { + rmb(); + rc = enqueue_dump_ready_buffer(thread_idx, buf_addr, seq); + if (rc == 0) { + break; + } + } + } + if (rc != 0) { + // Still full after spin — overwrite current buffer + account_dropped_records(state, cur->count); + cur->count = 0; + wmb(); + if (!s_logged_ready_queue_full[thread_idx]) { + s_logged_ready_queue_full[thread_idx] = true; + LOG_WARN( + "Tensor dump ready queue full on thread %d after spin-wait, " + "overwriting current buffer. 
Increase PLATFORM_DUMP_READYQUEUE_SIZE.", + thread_idx + ); + } + return 0; + } + + // Pop next buffer from free_queue + uint64_t new_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_DUMP_SLOT_COUNT]; + rmb(); + state->free_queue.head = head + 1; + + DumpMetaBuffer *new_buf = reinterpret_cast(new_ptr); + new_buf->count = 0; + s_current_dump_buf[thread_idx] = new_buf; + state->current_buf_ptr = new_ptr; + state->current_buf_seq = seq + 1; + wmb(); + + s_buffers_switched[thread_idx]++; + + return 0; +} + +struct CircularArenaWriter { + char *arena; + uint64_t arena_size; + uint64_t base_offset; + uint64_t bytes_written; + + void write(const void *src, uint64_t size) { + if (size == 0) { + return; + } + uint64_t pos = (base_offset + bytes_written) % arena_size; + if (pos + size <= arena_size) { + memcpy(arena + pos, src, size); + } else { + uint64_t first = arena_size - pos; + memcpy(arena + pos, src, first); + memcpy(arena, reinterpret_cast(src) + first, size - first); + } + bytes_written += size; + } +}; + +static inline uint64_t get_tensor_dump_num_elements(const TensorDumpInfo &info) { + uint64_t elements = 1; + for (uint32_t d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + elements *= info.shapes[d]; + } + return elements; +} + +static inline bool tensor_dump_is_contiguous(const TensorDumpInfo &info) { + if (info.ndims == 0) { + return true; + } + for (uint32_t d = 1; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + if (info.shapes[d] != info.raw_shapes[d]) { + return false; + } + } + return true; +} + +static inline uint64_t tensor_dump_start_offset_elements(const TensorDumpInfo &info) { + uint64_t result = 0; + uint64_t stride = 1; + for (int d = static_cast(info.ndims) - 1; d >= 0; d--) { + result += static_cast(info.offsets[d]) * stride; + stride *= info.raw_shapes[d]; + } + return result; +} + +static inline void write_tensor_dump_contiguous_prefix( + CircularArenaWriter *writer, const TensorDumpInfo &info, uint64_t elem_sz, uint64_t 
copy_bytes +) { + uint64_t start_offset = tensor_dump_start_offset_elements(info); + const char *src = reinterpret_cast(info.buffer_addr) + start_offset * elem_sz; + writer->write(src, copy_bytes); +} + +static void gather_tensor_dump_dim( + CircularArenaWriter *writer, const TensorDumpInfo &info, uint64_t elem_sz, uint32_t dim, + uint64_t base_element_index, uint64_t *remaining_bytes +) { + if (*remaining_bytes == 0 || dim >= PLATFORM_DUMP_MAX_DIMS) { + return; + } + if (dim + 1 >= info.ndims) { + uint64_t row_start = base_element_index + info.offsets[dim]; + const char *src = reinterpret_cast(info.buffer_addr) + row_start * elem_sz; + uint64_t row_bytes = static_cast(info.shapes[dim]) * elem_sz; + uint64_t bytes_to_copy = (row_bytes < *remaining_bytes) ? row_bytes : *remaining_bytes; + writer->write(src, bytes_to_copy); + *remaining_bytes -= bytes_to_copy; + return; + } + + uint64_t inner_stride = 1; + for (uint32_t d = dim + 1; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + inner_stride *= info.raw_shapes[d]; + } + for (uint32_t i = 0; i < info.shapes[dim] && *remaining_bytes > 0; i++) { + uint64_t next_base = base_element_index + (static_cast(info.offsets[dim]) + i) * inner_stride; + gather_tensor_dump_dim(writer, info, elem_sz, dim + 1, next_base, remaining_bytes); + } +} + +static inline void write_tensor_dump_logical_prefix( + CircularArenaWriter *writer, const TensorDumpInfo &info, uint64_t elem_sz, uint64_t copy_bytes +) { + if (copy_bytes == 0) { + return; + } + if (tensor_dump_is_contiguous(info)) { + write_tensor_dump_contiguous_prefix(writer, info, elem_sz, copy_bytes); + return; + } + + uint64_t remaining_bytes = copy_bytes; + gather_tensor_dump_dim(writer, info, elem_sz, 0, 0, &remaining_bytes); +} + +void dump_tensor_init(int num_dump_threads) { + void *dump_base = reinterpret_cast(get_platform_dump_base()); + if (dump_base == nullptr) { + LOG_ERROR("platform dump base is NULL, cannot initialize tensor dump"); + return; + } + + s_dump_header 
= get_dump_header(dump_base); + + LOG_INFO("Initializing tensor dump for %d threads", num_dump_threads); + + // Pop initial metadata buffer from free_queue for each thread + for (int t = 0; t < num_dump_threads; t++) { + DumpBufferState *state = get_dump_buffer_state(dump_base, t); + s_dump_states[t] = state; + + rmb(); + uint32_t head = state->free_queue.head; + uint32_t tail = state->free_queue.tail; + if (head != tail) { + uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_DUMP_SLOT_COUNT]; + rmb(); + state->free_queue.head = head + 1; + wmb(); + + DumpMetaBuffer *buf = reinterpret_cast(buf_ptr); + buf->count = 0; + s_current_dump_buf[t] = buf; + state->current_buf_ptr = buf_ptr; + state->current_buf_seq = 0; + wmb(); + LOG_DEBUG("Thread %d: popped initial dump buffer (addr=0x%lx)", t, buf_ptr); + } else { + LOG_ERROR("Thread %d: dump free_queue is empty during init!", t); + s_current_dump_buf[t] = nullptr; + state->current_buf_ptr = 0; + } + } + + memset(s_logged_ready_queue_full, 0, sizeof(s_logged_ready_queue_full)); + memset(s_logged_no_free_meta_buffer, 0, sizeof(s_logged_no_free_meta_buffer)); + memset(s_records_written, 0, sizeof(s_records_written)); + memset(s_buffers_switched, 0, sizeof(s_buffers_switched)); + memset(s_buffers_flushed, 0, sizeof(s_buffers_flushed)); +} + +int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) { + if (s_dump_header == nullptr) { + return -1; + } + if (thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + return -1; + } + + DumpBufferState *state = s_dump_states[thread_idx]; + DumpMetaBuffer *buf = s_current_dump_buf[thread_idx]; + if (buf == nullptr) { + return -1; + } + + // Switch metadata buffer if full + if (buf->count >= PLATFORM_DUMP_RECORDS_PER_BUFFER) { + if (switch_dump_meta_buffer(thread_idx) != 0) { + return -1; // No free buffer + } + buf = s_current_dump_buf[thread_idx]; + if (buf == nullptr) { + return -1; + } + } + + // Reserve space in arena + // Compute actual tensor 
data size from shape (not buffer.size which may include padding) + uint64_t actual_elements = get_tensor_dump_num_elements(info); + uint64_t elem_sz = get_element_size(static_cast(info.dtype)); + uint64_t bytes = actual_elements * elem_sz; + uint64_t copy_bytes = bytes; + bool truncated = false; + bool is_contiguous = tensor_dump_is_contiguous(info); + + if (bytes > state->arena_size) { + // Tensor larger than entire arena — copy a partial sample + copy_bytes = state->arena_size / 2; + truncated = true; + } + + uint64_t offset = state->arena_write_offset; + state->arena_write_offset = offset + copy_bytes; + + // Copy tensor data into arena (circular wraparound) + char *arena = reinterpret_cast(state->arena_base); + uint64_t arena_sz = state->arena_size; + CircularArenaWriter writer = {arena, arena_sz, offset, 0}; + write_tensor_dump_logical_prefix(&writer, info, elem_sz, copy_bytes); + wmb(); + + // Append metadata record + uint32_t idx = buf->count; + TensorDumpRecord *rec = &buf->records[idx]; + rec->task_id = info.task_id; + rec->subtask_id = info.subtask_id; + rec->func_id = info.func_id; + rec->arg_index = info.arg_index; + rec->is_contiguous = is_contiguous ? 1 : 0; + rec->role = static_cast(info.role); + rec->stage = static_cast(info.stage); + rec->ndims = info.ndims; + rec->dtype = info.dtype; + rec->truncated = truncated ? 
1 : 0; + rec->payload_offset = offset; + rec->payload_size = copy_bytes; + for (int d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + rec->raw_shapes[d] = info.raw_shapes[d]; + rec->shapes[d] = info.shapes[d]; + rec->offsets[d] = info.offsets[d]; + } + buf->count = idx + 1; + wmb(); + + s_records_written[thread_idx]++; + + return 0; +} + +void dump_tensor_flush(int thread_idx) { + if (s_dump_header == nullptr) { + return; + } + if (thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + return; + } + + DumpMetaBuffer *buf = s_current_dump_buf[thread_idx]; + if (buf != nullptr && buf->count > 0) { + uint64_t buf_addr = reinterpret_cast(buf); + uint32_t seq = s_dump_states[thread_idx]->current_buf_seq; + if (enqueue_dump_ready_buffer(thread_idx, buf_addr, seq) != 0) { + account_dropped_records(s_dump_states[thread_idx], buf->count); + buf->count = 0; + wmb(); + if (!s_logged_ready_queue_full[thread_idx]) { + s_logged_ready_queue_full[thread_idx] = true; + LOG_WARN( + "Tensor dump ready queue is full on thread %d, so the current metadata buffer will be " + "overwritten. Increase PLATFORM_DUMP_READYQUEUE_SIZE.", + thread_idx + ); + } + } + s_current_dump_buf[thread_idx] = nullptr; + s_dump_states[thread_idx]->current_buf_ptr = 0; + } + + s_buffers_flushed[thread_idx]++; + uint32_t dropped = s_dump_states[thread_idx] ? s_dump_states[thread_idx]->dropped_record_count : 0; + LOG_INFO( + "Thread %d: dump_tensor_flush (records=%u, buf_switches=%u, flushes=%u, dropped=%u)", thread_idx, + s_records_written[thread_idx], s_buffers_switched[thread_idx], s_buffers_flushed[thread_idx], dropped + ); +} diff --git a/src/a2a3/platform/src/host/tensor_dump_collector.cpp b/src/a2a3/platform/src/host/tensor_dump_collector.cpp new file mode 100644 index 000000000..a6a487c83 --- /dev/null +++ b/src/a2a3/platform/src/host/tensor_dump_collector.cpp @@ -0,0 +1,957 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file tensor_dump_collector.cpp + * @brief Host-side tensor dump collector implementation + * + * Mirrors performance_collector.cpp patterns: + * - DumpMemoryManager: background thread polling dump ready queues + * - TensorDumpCollector: lifecycle management, arena reads, file export + */ + +#include "host/tensor_dump_collector.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "common/memory_barrier.h" +#include "common/unified_log.h" + +// ============================================================================= +// DumpMemoryManager +// ============================================================================= + +DumpMemoryManager::~DumpMemoryManager() { + if (running_.load()) { + stop(); + } +} + +void DumpMemoryManager::register_mapping(void *dev_ptr, void *host_ptr) { dev_to_host_[dev_ptr] = host_ptr; } + +void *DumpMemoryManager::resolve_host_ptr(void *dev_ptr) { + auto it = dev_to_host_.find(dev_ptr); + if (it != dev_to_host_.end()) { + return it->second; + } + // Simulation mode: dev == host + return dev_ptr; +} + +void *DumpMemoryManager::alloc_and_register(size_t size, void **host_ptr_out) { + void *dev_ptr = alloc_cb_(size, user_data_); + if (dev_ptr == nullptr) { + return nullptr; + } + + void 
*host_ptr = dev_ptr; // Default for simulation + if (register_cb_ != nullptr) { + int rc = register_cb_(dev_ptr, size, device_id_, user_data_, &host_ptr); + if (rc != 0) { + free_cb_(dev_ptr, user_data_); + return nullptr; + } + } + + dev_to_host_[dev_ptr] = host_ptr; + if (host_ptr_out) { + *host_ptr_out = host_ptr; + } + return dev_ptr; +} + +void DumpMemoryManager::free_buffer(void *dev_ptr) { + if (dev_ptr && free_cb_) { + free_cb_(dev_ptr, user_data_); + } + dev_to_host_.erase(dev_ptr); +} + +void DumpMemoryManager::process_dump_entry( + DumpDataHeader * /*header*/, int thread_idx, const DumpReadyQueueEntry &entry +) { + void *dev_ptr = reinterpret_cast(entry.buffer_ptr); + void *host_ptr = resolve_host_ptr(dev_ptr); + + DumpReadyBufferInfo info; + info.thread_index = entry.thread_index; + info.dev_buffer_ptr = dev_ptr; + info.host_buffer_ptr = host_ptr; + info.buffer_seq = entry.buffer_seq; + + { + std::lock_guard lock(ready_mutex_); + ready_queue_.push(info); + } + ready_cv_.notify_one(); + + // Replenish: fill free_queue to capacity + DumpBufferState *state = get_dump_buffer_state(shared_mem_host_, thread_idx); + rmb(); + uint32_t fq_head = state->free_queue.head; + uint32_t fq_tail = state->free_queue.tail; + uint32_t fq_used = fq_tail - fq_head; + + while (fq_used < PLATFORM_DUMP_SLOT_COUNT) { + void *new_dev = nullptr; + if (!recycled_dump_buffers_.empty()) { + new_dev = recycled_dump_buffers_.back(); + recycled_dump_buffers_.pop_back(); + } else { + // Batch-allocate to refill recycled pool, avoiding per-buffer alloc overhead + int batch = PLATFORM_DUMP_BUFFERS_PER_THREAD - PLATFORM_DUMP_SLOT_COUNT; + if (batch < 1) { + batch = 1; + } + for (int i = 0; i < batch; i++) { + void *host = nullptr; + void *dev = alloc_and_register(sizeof(DumpMetaBuffer), &host); + if (dev == nullptr) { + break; + } + recycled_dump_buffers_.push_back(dev); + } + if (!recycled_dump_buffers_.empty()) { + new_dev = recycled_dump_buffers_.back(); + 
recycled_dump_buffers_.pop_back(); + } + } + if (new_dev == nullptr) { + break; + } + state->free_queue.buffer_ptrs[fq_tail % PLATFORM_DUMP_SLOT_COUNT] = reinterpret_cast(new_dev); + wmb(); + fq_tail++; + state->free_queue.tail = fq_tail; + wmb(); + fq_used++; + } +} + +void DumpMemoryManager::mgmt_loop() { + if (set_device_cb_ != nullptr) { + set_device_cb_(device_id_, user_data_); + } + + DumpDataHeader *header = get_dump_header(shared_mem_host_); + uint64_t total_entries_processed = 0; + uint64_t total_replenished = 0; + + while (running_.load()) { + bool did_work = false; + + // Poll all threads' ready queues + for (int t = 0; t < num_dump_threads_; t++) { + rmb(); + uint32_t head = header->queue_heads[t]; + uint32_t tail = header->queue_tails[t]; + + while (head != tail) { + DumpReadyQueueEntry &entry = header->queues[t][head]; + process_dump_entry(header, t, entry); + head = (head + 1) % PLATFORM_DUMP_READYQUEUE_SIZE; + header->queue_heads[t] = head; + wmb(); + did_work = true; + total_entries_processed++; + tail = header->queue_tails[t]; + } + } + + // Proactively replenish free queues from recycled pool even when no new ready entries + { + std::lock_guard lock(done_mutex_); + while (!done_queue_.empty()) { + void *dev_ptr = done_queue_.front(); + done_queue_.pop(); + recycled_dump_buffers_.push_back(dev_ptr); + did_work = true; + } + } + + // Push recycled buffers into free queues that have space + for (int t = 0; t < num_dump_threads_ && !recycled_dump_buffers_.empty(); t++) { + DumpBufferState *state = get_dump_buffer_state(shared_mem_host_, t); + rmb(); + uint32_t fq_head = state->free_queue.head; + uint32_t fq_tail = state->free_queue.tail; + uint32_t fq_used = fq_tail - fq_head; + + while (fq_used < PLATFORM_DUMP_SLOT_COUNT && !recycled_dump_buffers_.empty()) { + void *new_dev = recycled_dump_buffers_.back(); + recycled_dump_buffers_.pop_back(); + state->free_queue.buffer_ptrs[fq_tail % PLATFORM_DUMP_SLOT_COUNT] = reinterpret_cast(new_dev); + wmb(); + 
fq_tail++; + state->free_queue.tail = fq_tail; + wmb(); + fq_used++; + total_replenished++; + did_work = true; + } + } + + if (!did_work) { + std::this_thread::sleep_for(std::chrono::microseconds(10)); + } + } + + LOG_DEBUG("Dump memory manager: %lu ready entries, %lu replenished", total_entries_processed, total_replenished); +} + +void DumpMemoryManager::start( + void *shared_mem_host, int num_dump_threads, DumpAllocCallback alloc_cb, DumpRegisterCallback register_cb, + DumpFreeCallback free_cb, void *user_data, int device_id, DumpSetDeviceCallback set_device_cb +) { + shared_mem_host_ = shared_mem_host; + num_dump_threads_ = num_dump_threads; + alloc_cb_ = alloc_cb; + register_cb_ = register_cb; + free_cb_ = free_cb; + user_data_ = user_data; + device_id_ = device_id; + set_device_cb_ = set_device_cb; + + LOG_INFO("Starting dump memory manager (device=%d, threads=%d)", device_id, num_dump_threads); + running_.store(true); + mgmt_thread_ = std::thread(&DumpMemoryManager::mgmt_loop, this); +} + +void DumpMemoryManager::stop() { + running_.store(false); + if (mgmt_thread_.joinable()) { + mgmt_thread_.join(); + } +} + +bool DumpMemoryManager::try_pop_ready(DumpReadyBufferInfo &info) { + std::lock_guard lock(ready_mutex_); + if (ready_queue_.empty()) { + return false; + } + info = ready_queue_.front(); + ready_queue_.pop(); + return true; +} + +bool DumpMemoryManager::wait_pop_ready(DumpReadyBufferInfo &info, std::chrono::milliseconds timeout) { + std::unique_lock lock(ready_mutex_); + if (ready_cv_.wait_for(lock, timeout, [this] { + return !ready_queue_.empty(); + })) { + info = ready_queue_.front(); + ready_queue_.pop(); + return true; + } + return false; +} + +void DumpMemoryManager::notify_copy_done(void *dev_buffer_ptr) { + std::lock_guard lock(done_mutex_); + done_queue_.push(dev_buffer_ptr); +} + +// ============================================================================= +// TensorDumpCollector +// 
============================================================================= + +TensorDumpCollector::~TensorDumpCollector() { + if (memory_manager_.is_running()) { + memory_manager_.stop(); + } +} + +void *TensorDumpCollector::alloc_single_buffer(size_t size, void **host_ptr_out) { + void *dev_ptr = alloc_cb_(size, user_data_); + if (dev_ptr == nullptr) { + return nullptr; + } + + void *host_ptr = dev_ptr; + if (register_cb_ != nullptr) { + int rc = register_cb_(dev_ptr, size, device_id_, user_data_, &host_ptr); + if (rc != 0) { + free_cb_(dev_ptr, user_data_); + return nullptr; + } + } + + if (host_ptr_out) { + *host_ptr_out = host_ptr; + } + return dev_ptr; +} + +int TensorDumpCollector::initialize( + int num_dump_threads, int device_id, DumpAllocCallback alloc_cb, DumpRegisterCallback register_cb, + DumpFreeCallback free_cb, void *user_data, DumpSetDeviceCallback set_device_cb +) { + num_dump_threads_ = num_dump_threads; + device_id_ = device_id; + alloc_cb_ = alloc_cb; + register_cb_ = register_cb; + free_cb_ = free_cb; + user_data_ = user_data; + set_device_cb_ = set_device_cb; + + // Allocate dump shared memory (header + buffer states) + size_t shm_size = calc_dump_data_size(num_dump_threads); + dump_shared_mem_dev_ = alloc_single_buffer(shm_size, &dump_shared_mem_host_); + if (dump_shared_mem_dev_ == nullptr) { + LOG_ERROR("Failed to allocate dump shared memory (%zu bytes)", shm_size); + return -1; + } + was_registered_ = (register_cb != nullptr); + + // Initialize header + memset(dump_shared_mem_host_, 0, shm_size); + DumpDataHeader *header = get_dump_header(dump_shared_mem_host_); + header->magic = TENSOR_DUMP_MAGIC; + header->num_dump_threads = static_cast(num_dump_threads); + header->records_per_buffer = PLATFORM_DUMP_RECORDS_PER_BUFFER; + + uint64_t arena_size = calc_dump_arena_size(); + header->arena_size_per_thread = arena_size; + + // Allocate per-thread arenas + arenas_.resize(num_dump_threads); + for (int t = 0; t < num_dump_threads; t++) { + 
ArenaInfo &ai = arenas_[t]; + ai.size = arena_size; + ai.dev_ptr = alloc_single_buffer(arena_size, &ai.host_ptr); + if (ai.dev_ptr == nullptr) { + LOG_ERROR("Failed to allocate dump arena for thread %d (%lu bytes)", t, arena_size); + return -1; + } + + // Set arena info in buffer state + DumpBufferState *state = get_dump_buffer_state(dump_shared_mem_host_, t); + state->arena_base = reinterpret_cast(ai.dev_ptr); + state->arena_size = arena_size; + state->arena_write_offset = 0; + state->dropped_record_count = 0; + + LOG_INFO( + "Thread %d: dump arena allocated (dev=%p, host=%p, size=%lu MB)", t, ai.dev_ptr, ai.host_ptr, + arena_size / (1024 * 1024) + ); + } + + // Allocate initial DumpMetaBuffers and push into free_queues + for (int t = 0; t < num_dump_threads; t++) { + DumpBufferState *state = get_dump_buffer_state(dump_shared_mem_host_, t); + + for (int b = 0; b < PLATFORM_DUMP_BUFFERS_PER_THREAD; b++) { + void *host_ptr = nullptr; + void *dev_ptr = alloc_single_buffer(sizeof(DumpMetaBuffer), &host_ptr); + if (dev_ptr == nullptr) { + LOG_ERROR("Failed to allocate dump meta buffer %d for thread %d", b, t); + return -1; + } + + memory_manager_.register_mapping(dev_ptr, host_ptr); + + if (b < PLATFORM_DUMP_SLOT_COUNT) { + // Push into SPSC free_queue + uint32_t tail = state->free_queue.tail; + state->free_queue.buffer_ptrs[tail % PLATFORM_DUMP_SLOT_COUNT] = reinterpret_cast(dev_ptr); + state->free_queue.tail = tail + 1; + } else { + // Remaining go to recycled pool + memory_manager_.recycled_dump_buffers_.push_back(dev_ptr); + } + } + } + + LOG_INFO( + "Tensor dump initialized: %d threads, arena=%lu MB/thread, %d buffers/thread", num_dump_threads, + arena_size / (1024 * 1024), PLATFORM_DUMP_BUFFERS_PER_THREAD + ); + + return 0; +} + +void TensorDumpCollector::start_memory_manager() { + execution_complete_.store(false); + memory_manager_.start( + dump_shared_mem_host_, num_dump_threads_, alloc_cb_, register_cb_, free_cb_, user_data_, device_id_, + set_device_cb_ + ); 
+} + +void TensorDumpCollector::process_dump_buffer(const DumpReadyBufferInfo &info) { + // Track processed buffer pointers to prevent double-processing + // (flush + drain can deliver a buffer that scan_remaining also sees) + if (processed_buffers_.count(info.dev_buffer_ptr)) { + return; + } + processed_buffers_.insert(info.dev_buffer_ptr); + + DumpMetaBuffer *buf = reinterpret_cast(info.host_buffer_ptr); + uint32_t count = buf->count; + + if (count == 0) { + return; + } + + if (count > PLATFORM_DUMP_RECORDS_PER_BUFFER) { + LOG_ERROR( + "Dump collector: invalid record count %u in buffer (thread=%u, seq=%u, max=%d), skipping", count, + info.thread_index, info.buffer_seq, PLATFORM_DUMP_RECORDS_PER_BUFFER + ); + return; + } + + for (uint32_t i = 0; i < count; i++) { + const TensorDumpRecord &rec = buf->records[i]; + + DumpedTensor dt; + dt.task_id = rec.task_id; + dt.subtask_id = rec.subtask_id; + dt.func_id = rec.func_id; + dt.arg_index = rec.arg_index; + dt.role = static_cast(rec.role); + dt.stage = static_cast(rec.stage); + dt.dtype = rec.dtype; + dt.ndims = rec.ndims; + dt.is_contiguous = (rec.is_contiguous != 0); + dt.truncated = (rec.truncated != 0); + dt.overwritten = false; + if (dt.truncated && ++total_truncated_count_ == 1) { + LOG_WARN("Tensor dump truncation detected. 
Increase PLATFORM_DUMP_AVG_TENSOR_BYTES."); + } + memcpy(dt.raw_shapes, rec.raw_shapes, sizeof(dt.raw_shapes)); + memcpy(dt.shapes, rec.shapes, sizeof(dt.shapes)); + memcpy(dt.offsets, rec.offsets, sizeof(dt.offsets)); + + // Read tensor data from arena + int thread_idx = static_cast(info.thread_index); + if (thread_idx < static_cast(arenas_.size())) { + ArenaInfo &ai = arenas_[thread_idx]; + char *arena_host = reinterpret_cast(ai.host_ptr); + uint64_t arena_sz = ai.size; + + // Check if data was overwritten (offset too old) + uint64_t high_water = ai.high_water; + if (high_water > arena_sz && rec.payload_offset < high_water - arena_sz) { + dt.overwritten = true; + if (++total_overwrite_count_ == 1) { + LOG_WARN( + "Tensor dump overwrite detected: host drain was slower than arena reuse. " + "Increase PLATFORM_DUMP_BUFFERS_PER_THREAD." + ); + } + } else { + dt.overwritten = false; + } + + if (!dt.overwritten && rec.payload_size > 0) { + dt.bytes.resize(rec.payload_size); + uint64_t pos = rec.payload_offset % arena_sz; + if (pos + rec.payload_size <= arena_sz) { + memcpy(dt.bytes.data(), arena_host + pos, rec.payload_size); + } else { + // Wraparound read + uint64_t first = arena_sz - pos; + memcpy(dt.bytes.data(), arena_host + pos, first); + memcpy(dt.bytes.data() + first, arena_host, rec.payload_size - first); + } + } + + // Update high-water mark + uint64_t end_offset = rec.payload_offset + rec.payload_size; + if (end_offset > ai.high_water) { + ai.high_water = end_offset; + } + } + + dt.payload_size = dt.bytes.size(); + + bool has_payload = !dt.overwritten && !dt.bytes.empty(); + dt.bin_offset = has_payload ? 
next_bin_offset_ : 0; + if (has_payload) { + next_bin_offset_ += dt.payload_size; + } + + // Store metadata-only copy in collected_ (no payload bytes) + DumpedTensor meta = dt; + meta.bytes.clear(); + { + std::lock_guard lock(collected_mutex_); + collected_.push_back(std::move(meta)); + } + + // Enqueue full tensor (with payload) to writer thread + if (has_payload) { + { + std::lock_guard lock(write_mutex_); + write_queue_.push(std::move(dt)); + } + write_cv_.notify_one(); + } + } +} + +static const char *tensor_dump_role_name(TensorDumpRole role) { + switch (role) { + case TensorDumpRole::INPUT: + return "input"; + case TensorDumpRole::OUTPUT: + return "output"; + case TensorDumpRole::INOUT: + return "inout"; + } + return "unknown"; +} + +static const char *tensor_dump_stage_name(TensorDumpStage stage) { + switch (stage) { + case TensorDumpStage::BEFORE_DISPATCH: + return "before_dispatch"; + case TensorDumpStage::AFTER_COMPLETION: + return "after_completion"; + } + return "unknown"; +} + +static std::string dims_to_string(const uint32_t dims[], int ndims) { + std::ostringstream ss; + ss << "["; + for (int d = 0; d < ndims; d++) { + if (d > 0) { + ss << ", "; + } + ss << dims[d]; + } + ss << "]"; + return ss.str(); +} + +void TensorDumpCollector::poll_and_collect() { + const auto wait_timeout = std::chrono::milliseconds(100); + const auto idle_timeout = std::chrono::seconds(PLATFORM_DUMP_TIMEOUT_SECONDS); + uint64_t buffers_collected = 0; + auto start_time = std::chrono::steady_clock::now(); + auto last_progress_time = start_time; + bool idle_timer_started = false; + std::chrono::steady_clock::time_point idle_start; + + // Create output directory and start writer thread + auto now_wall = std::chrono::system_clock::now(); + auto time_t_now = std::chrono::system_clock::to_time_t(now_wall); + struct tm tm_now; + localtime_r(&time_t_now, &tm_now); + char ts[32]; + strftime(ts, sizeof(ts), "%Y%m%d_%H%M%S", &tm_now); + + std::string base_name = 
std::string("tensor_dump_") + ts; + run_dir_ = std::filesystem::path("outputs") / base_name; + std::filesystem::create_directories(run_dir_); + bin_file_.open(run_dir_ / (base_name + ".bin"), std::ios::binary); + next_bin_offset_ = 0; + + writer_done_.store(false); + bytes_written_.store(0); + writer_thread_ = std::thread(&TensorDumpCollector::writer_loop, this); + + while (true) { + DumpReadyBufferInfo info; + if (memory_manager_.try_pop_ready(info)) { + process_dump_buffer(info); + memory_manager_.notify_copy_done(info.dev_buffer_ptr); + buffers_collected++; + idle_timer_started = false; + + auto now = std::chrono::steady_clock::now(); + if (std::chrono::duration_cast(now - last_progress_time).count() >= 5) { + auto elapsed_s = std::chrono::duration_cast(now - start_time).count(); + LOG_INFO( + "Collecting: %zu tensors, %.1f GB written (%lds)", collected_.size(), bytes_written_.load() / 1e9, + elapsed_s + ); + last_progress_time = now; + } + } else { + if (!memory_manager_.wait_pop_ready(info, wait_timeout)) { + if (execution_complete_.load()) { + DumpReadyBufferInfo drain_info; + while (memory_manager_.try_pop_ready(drain_info)) { + process_dump_buffer(drain_info); + memory_manager_.notify_copy_done(drain_info.dev_buffer_ptr); + buffers_collected++; + } + break; + } + + if (!idle_timer_started) { + idle_start = std::chrono::steady_clock::now(); + idle_timer_started = true; + } + auto idle_elapsed = std::chrono::steady_clock::now() - idle_start; + if (idle_elapsed >= idle_timeout) { + LOG_ERROR( + "Tensor dump collection idle timeout after %ld seconds", + std::chrono::duration_cast(idle_elapsed).count() + ); + LOG_ERROR( + "Collected %lu buffers and %zu tensors before timeout", buffers_collected, collected_.size() + ); + break; + } + continue; + } + process_dump_buffer(info); + memory_manager_.notify_copy_done(info.dev_buffer_ptr); + buffers_collected++; + idle_timer_started = false; + } + } + + // Stop writer thread and wait for it to drain, reporting progress + 
writer_done_.store(true); + write_cv_.notify_one(); + while (writer_thread_.joinable()) { + if (write_queue_.empty()) { + writer_thread_.join(); + break; + } + auto elapsed_s = + std::chrono::duration_cast(std::chrono::steady_clock::now() - start_time).count(); + LOG_INFO( + "Writing to disk: %.1f GB written, %zu tensors remaining (%lds)", bytes_written_.load() / 1e9, + write_queue_.size(), elapsed_s + ); + std::this_thread::sleep_for(std::chrono::seconds(10)); + } + + bin_file_.close(); + + auto elapsed_ms = + std::chrono::duration_cast(std::chrono::steady_clock::now() - start_time).count(); + LOG_INFO( + "Collected %zu tensors, wrote %.1f GB to disk (%.1fs)", collected_.size(), bytes_written_.load() / 1e9, + elapsed_ms / 1000.0 + ); +} + +void TensorDumpCollector::signal_execution_complete() { execution_complete_.store(true); } + +void TensorDumpCollector::stop_memory_manager() { memory_manager_.stop(); } + +void TensorDumpCollector::drain_remaining_buffers() { + DumpReadyBufferInfo info; + while (memory_manager_.try_pop_ready(info)) { + process_dump_buffer(info); + memory_manager_.notify_copy_done(info.dev_buffer_ptr); + } +} + +void TensorDumpCollector::scan_remaining_dump_buffers() { + uint32_t dropped_total = 0; + // Scan current_buf_ptr for each thread for partial buffers not yet enqueued + for (int t = 0; t < num_dump_threads_; t++) { + DumpBufferState *state = get_dump_buffer_state(dump_shared_mem_host_, t); + + // Accumulate dropped-record counts regardless of buffer state. 
+ total_dropped_record_count_ += state->dropped_record_count; + dropped_total += state->dropped_record_count; + + uint64_t cur_ptr = state->current_buf_ptr; + if (cur_ptr == 0) { + continue; + } + + void *dev_ptr = reinterpret_cast(cur_ptr); + void *host_ptr = memory_manager_.resolve_host_ptr(dev_ptr); + + DumpMetaBuffer *buf = reinterpret_cast(host_ptr); + if (buf->count > 0) { + DumpReadyBufferInfo info; + info.thread_index = static_cast(t); + info.dev_buffer_ptr = dev_ptr; + info.host_buffer_ptr = host_ptr; + info.buffer_seq = state->current_buf_seq; + process_dump_buffer(info); + } + } + if (dropped_total > 0) { + LOG_WARN( + "Dump collector: %u records dropped on device side. " + "Increase PLATFORM_DUMP_BUFFERS_PER_THREAD or PLATFORM_DUMP_READYQUEUE_SIZE.", + dropped_total + ); + } +} + +static std::string get_dtype_name_from_raw(uint8_t dtype) { return get_dtype_name(static_cast(dtype)); } + +static uint64_t get_num_elements(const DumpedTensor &dt) { + uint64_t numel = 1; + for (int d = 0; d < dt.ndims; d++) { + numel *= dt.shapes[d]; + } + return (dt.ndims == 0) ? 1 : numel; +} + +void TensorDumpCollector::writer_loop() { + while (true) { + DumpedTensor dt; + { + std::unique_lock lock(write_mutex_); + write_cv_.wait(lock, [this] { + return !write_queue_.empty() || writer_done_.load(); + }); + if (write_queue_.empty() && writer_done_.load()) { + break; + } + dt = std::move(write_queue_.front()); + write_queue_.pop(); + } + + if (!dt.bytes.empty()) { + bin_file_.write( + reinterpret_cast(dt.bytes.data()), static_cast(dt.bytes.size()) + ); + } + + bytes_written_ += dt.bytes.size(); + } +} + +int TensorDumpCollector::export_dump_files(const std::string & /*output_path*/) { + if (collected_.empty()) { + LOG_WARN("No tensor dump data to export"); + return 0; + } + auto export_start = std::chrono::steady_clock::now(); + + // Sort by task_id then subtask_id then func_id. 
+ std::sort(collected_.begin(), collected_.end(), [](const DumpedTensor &a, const DumpedTensor &b) { + if (a.task_id != b.task_id) return a.task_id < b.task_id; + if (a.subtask_id != b.subtask_id) return a.subtask_id < b.subtask_id; + if (a.func_id != b.func_id) return a.func_id < b.func_id; + if (a.stage != b.stage) return static_cast(a.stage) < static_cast(b.stage); + if (a.arg_index != b.arg_index) return a.arg_index < b.arg_index; + return static_cast(a.role) < static_cast(b.role); + }); + + LOG_INFO("Writing JSON manifest for %zu tensors...", collected_.size()); + + uint32_t num_before_dispatch = 0; + uint32_t num_after_completion = 0; + uint32_t num_input_tensors = 0; + uint32_t num_output_tensors = 0; + uint32_t num_inout_tensors = 0; + for (const auto &dt : collected_) { + if (dt.stage == TensorDumpStage::BEFORE_DISPATCH) { + num_before_dispatch++; + } else { + num_after_completion++; + } + switch (dt.role) { + case TensorDumpRole::INPUT: + num_input_tensors++; + break; + case TensorDumpRole::OUTPUT: + num_output_tensors++; + break; + case TensorDumpRole::INOUT: + num_inout_tensors++; + break; + } + } + + // Write JSON manifest (txt/bin files already written by writer thread) + std::string base_name = run_dir_.filename().string(); + std::ofstream json(run_dir_ / (base_name + ".json")); + json << "{\n"; + json << " \"timestamp\": \"" << base_name.substr(sizeof("tensor_dump_") - 1) << "\",\n"; + json << " \"run_dir\": \"" << base_name << "\",\n"; + json << " \"bin_format\": {\n"; + json << " \"type\": \"logical_contiguous\",\n"; + json << " \"byte_order\": \"little_endian\"\n"; + json << " },\n"; + json << " \"total_tensors\": " << collected_.size() << ",\n"; + json << " \"before_dispatch\": " << num_before_dispatch << ",\n"; + json << " \"after_completion\": " << num_after_completion << ",\n"; + json << " \"input_tensors\": " << num_input_tensors << ",\n"; + json << " \"output_tensors\": " << num_output_tensors << ",\n"; + json << " \"inout_tensors\": " << 
num_inout_tensors << ",\n"; + json << " \"truncated_tensors\": " << total_truncated_count_ << ",\n"; + json << " \"dropped_records\": " << total_dropped_record_count_ << ",\n"; + json << " \"dropped_overwrite\": " << total_overwrite_count_ << ",\n"; + json << " \"bin_file\": \"" << base_name << ".bin\",\n"; + json << " \"tensors\": [\n"; + + bool first_entry = true; + + for (size_t i = 0; i < collected_.size(); i++) { + const DumpedTensor &dt = collected_[i]; + std::string dtype_name = get_dtype_name_from_raw(dt.dtype); + uint64_t numel = get_num_elements(dt); + + std::string shape_str = dims_to_string(dt.shapes, dt.ndims); + std::string raw_shape_str = dims_to_string(dt.raw_shapes, dt.ndims); + std::string offsets_str = dims_to_string(dt.offsets, dt.ndims); + + if (!first_entry) json << ",\n"; + first_entry = false; + + json << " {\"task_id\": \"0x" << std::hex << std::setfill('0') << std::setw(16) << dt.task_id << std::dec + << "\", \"subtask_id\": " << static_cast(dt.subtask_id) << ", \"func_id\": " << dt.func_id + << ", \"role\": \"" << tensor_dump_role_name(dt.role) << "\", \"stage\": \"" + << tensor_dump_stage_name(dt.stage) << "\", \"arg_index\": " << dt.arg_index << ", \"dtype\": \"" + << dtype_name << "\", \"is_contiguous\": " << (dt.is_contiguous ? "true" : "false") + << ", \"shape\": " << shape_str << ", \"raw_shape\": " << raw_shape_str << ", \"offsets\": " << offsets_str + << ", \"numel\": " << numel << ", \"bin_offset\": " << dt.bin_offset + << ", \"bin_size\": " << dt.payload_size << ", \"truncated\": " << (dt.truncated ? "true" : "false") + << ", \"overwritten\": " << (dt.overwritten ? 
"true" : "false") << "}"; + } + + json << "\n ]\n}\n"; + json.close(); + + auto export_end = std::chrono::steady_clock::now(); + auto total_ms = std::chrono::duration_cast(export_end - export_start).count(); + LOG_INFO("Wrote JSON manifest (%zu tensors) to %s (%ldms)", collected_.size(), run_dir_.c_str(), total_ms); + + if (total_truncated_count_ > 0 || total_dropped_record_count_ > 0 || total_overwrite_count_ > 0) { + LOG_WARN( + "Tensor dump anomalies: truncated=%u, dropped_records=%u, overwritten=%u", total_truncated_count_, + total_dropped_record_count_, total_overwrite_count_ + ); + } + + // Clear state so subsequent runs don't accumulate data from previous runs + collected_.clear(); + processed_buffers_.clear(); + total_dropped_record_count_ = 0; + total_truncated_count_ = 0; + total_overwrite_count_ = 0; + for (auto &ai : arenas_) { + ai.high_water = 0; + } + return 0; +} + +int TensorDumpCollector::finalize(DumpUnregisterCallback unregister_cb, DumpFreeCallback free_cb, void *user_data) { + // Stop memory manager if still running + if (memory_manager_.is_running()) { + memory_manager_.stop(); + } + + std::unordered_set released_meta_buffers; + auto release_meta_buffer = [&](void *ptr) { + if (ptr == nullptr || !released_meta_buffers.insert(ptr).second) { + return; + } + if (was_registered_ && unregister_cb) { + unregister_cb(ptr, device_id_, user_data); + } + if (free_cb) { + free_cb(ptr, user_data); + } + }; + + // Free DumpMetaBuffers still in free_queues and current_buf_ptr + if (dump_shared_mem_host_) { + for (int t = 0; t < num_dump_threads_; t++) { + DumpBufferState *state = get_dump_buffer_state(dump_shared_mem_host_, t); + + // Free current buffer if any + release_meta_buffer(reinterpret_cast(state->current_buf_ptr)); + state->current_buf_ptr = 0; + + // Free all buffers remaining in free_queue + rmb(); + uint32_t head = state->free_queue.head; + uint32_t tail = state->free_queue.tail; + uint32_t queued = tail - head; + if (queued > 
PLATFORM_DUMP_SLOT_COUNT) { + queued = PLATFORM_DUMP_SLOT_COUNT; + } + for (uint32_t i = 0; i < queued; i++) { + uint32_t slot = (head + i) % PLATFORM_DUMP_SLOT_COUNT; + release_meta_buffer(reinterpret_cast(state->free_queue.buffer_ptrs[slot])); + state->free_queue.buffer_ptrs[slot] = 0; + } + state->free_queue.head = tail; + } + } + + // Free buffers still queued for host processing + { + std::lock_guard lock(memory_manager_.ready_mutex_); + while (!memory_manager_.ready_queue_.empty()) { + release_meta_buffer(memory_manager_.ready_queue_.front().dev_buffer_ptr); + memory_manager_.ready_queue_.pop(); + } + } + + // Free buffers held by memory manager (done_queue + recycled pool) + { + std::lock_guard lock(memory_manager_.done_mutex_); + while (!memory_manager_.done_queue_.empty()) { + void *ptr = memory_manager_.done_queue_.front(); + memory_manager_.done_queue_.pop(); + release_meta_buffer(ptr); + } + } + for (void *ptr : memory_manager_.recycled_dump_buffers_) { + release_meta_buffer(ptr); + } + memory_manager_.recycled_dump_buffers_.clear(); + memory_manager_.dev_to_host_.clear(); + + // Free arenas + for (auto &ai : arenas_) { + if (ai.dev_ptr) { + if (unregister_cb) { + unregister_cb(ai.dev_ptr, device_id_, user_data); + } + if (free_cb) { + free_cb(ai.dev_ptr, user_data); + } + ai.dev_ptr = nullptr; + ai.host_ptr = nullptr; + } + } + arenas_.clear(); + + // Free shared memory + if (dump_shared_mem_dev_) { + if (was_registered_ && unregister_cb) { + unregister_cb(dump_shared_mem_dev_, device_id_, user_data); + } + if (free_cb) { + free_cb(dump_shared_mem_dev_, user_data); + } + dump_shared_mem_dev_ = nullptr; + dump_shared_mem_host_ = nullptr; + } + + // Reset state + num_dump_threads_ = 0; + execution_complete_.store(false); + collected_.clear(); + processed_buffers_.clear(); + total_dropped_record_count_ = 0; + total_truncated_count_ = 0; + total_overwrite_count_ = 0; + + return 0; +} diff --git a/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp 
b/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp index e27329d32..8e0e1a855 100644 --- a/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp @@ -36,6 +36,7 @@ // Performance profiling headers #include "aicpu/performance_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" #include "common/memory_barrier.h" #include "common/perf_profiling.h" #include "common/unified_log.h" @@ -405,6 +406,19 @@ struct AicpuExecutor { ); cur_thread_completed++; if (mixed_complete) { +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensors_for_task( + thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION, + [](uint8_t active_mask, uint8_t raw_subtask_id) { + return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); + }, + [this](int32_t func_id) { + return get_function_bin_addr(func_id); + } + ); + } +#endif completed_this_turn++; } made_progress = true; @@ -499,9 +513,16 @@ struct AicpuExecutor { PTO2SubtaskSlot subslot #if PTO2_PROFILING , - bool profiling_enabled, int32_t thread_idx + bool profiling_enabled +#endif +#if PTO2_PROFILING || PTO2_DUMP_TENSOR + , + int32_t thread_idx #endif ) { +#if !PTO2_PROFILING + (void)runtime; // NOLINT(readability/casting) +#endif PTO2DispatchPayload &payload = s_pto2_payload_per_core[core_id]; PTO2TaskDescriptor &task = *slot_state.task; int32_t slot_idx = static_cast(subslot); @@ -518,6 +539,7 @@ struct AicpuExecutor { core_dispatch_counts_[core_id]++; } #endif + // Per-core monotonic counter for register protocol uniqueness. 
// PTO2 task_id encodes (ring_id << 32 | local_id); truncation to uint32 loses ring_id, // so tasks from different rings with the same local_id would write identical DATA_MAIN_BASE @@ -922,6 +944,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa perf_aicpu_set_orch_thread_idx(sched_thread_num_); } #endif +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensor_init(orch_to_sched_ ? thread_num_ : sched_thread_num_); + } +#endif DEV_INFO("Thread %d: one-time init done", thread_idx); pto2_init_complete_.store(true, std::memory_order_release); @@ -1136,13 +1163,29 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa uint64_t t_setup_start = get_sys_cnt_aicpu(); #endif ResourceCount rc = shape_resource_count(shape); - +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensors_for_task( + thread_idx, *slot_state, TensorDumpStage::BEFORE_DISPATCH, + [](uint8_t active_mask, uint8_t raw_subtask_id) { + return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); + }, + [this](int32_t func_id) { + return get_function_bin_addr(func_id); + } + ); + } +#endif if (rc.aic) { dispatch_subtask_to_core( runtime, tracker, c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC #if PTO2_PROFILING , - profiling_enabled, thread_idx + profiling_enabled +#endif +#if PTO2_PROFILING || PTO2_DUMP_TENSOR + , + thread_idx #endif ); } @@ -1152,7 +1195,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, tracker, aiv0, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 #if PTO2_PROFILING , - profiling_enabled, thread_idx + profiling_enabled +#endif +#if PTO2_PROFILING || PTO2_DUMP_TENSOR + , + thread_idx #endif ); } @@ -1161,7 +1208,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, tracker, c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 #if PTO2_PROFILING , - profiling_enabled, 
thread_idx + profiling_enabled +#endif +#if PTO2_PROFILING || PTO2_DUMP_TENSOR + , + thread_idx #endif ); } @@ -1221,13 +1272,29 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #endif Cluster &c = tracker.clusters[ci]; ResourceCount rc = shape_resource_count(shape); - +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensors_for_task( + thread_idx, *slot_state, TensorDumpStage::BEFORE_DISPATCH, + [](uint8_t active_mask, uint8_t raw_subtask_id) { + return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); + }, + [this](int32_t func_id) { + return get_function_bin_addr(func_id); + } + ); + } +#endif if (rc.aic) { dispatch_subtask_to_core( runtime, tracker, c.aic_core_id, CoreType::AIC, *slot_state, PTO2SubtaskSlot::AIC #if PTO2_PROFILING , - profiling_enabled, thread_idx + profiling_enabled +#endif +#if PTO2_PROFILING || PTO2_DUMP_TENSOR + , + thread_idx #endif ); } @@ -1237,7 +1304,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, tracker, aiv_id, CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV0 #if PTO2_PROFILING , - profiling_enabled, thread_idx + profiling_enabled +#endif +#if PTO2_PROFILING || PTO2_DUMP_TENSOR + , + thread_idx #endif ); } @@ -1246,7 +1317,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, tracker, c.aiv_core_ids[1], CoreType::AIV, *slot_state, PTO2SubtaskSlot::AIV1 #if PTO2_PROFILING , - profiling_enabled, thread_idx + profiling_enabled +#endif +#if PTO2_PROFILING || PTO2_DUMP_TENSOR + , + thread_idx #endif ); } @@ -1575,6 +1650,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa perf_aicpu_flush_phase_buffers(thread_idx); } #endif +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensor_flush(thread_idx); + } +#endif return cur_thread_completed; } diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h 
b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h index b75834dfa..85decb324 100644 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h +++ b/src/a2a3/runtime/aicpu_build_graph/runtime/pto_runtime2_types.h @@ -58,6 +58,14 @@ #error "PTO2_SCHED_PROFILING requires PTO2_PROFILING=1" #endif +// ============================================================================= +// Dump Tensor Configuration +// ============================================================================= + +#ifndef PTO2_DUMP_TENSOR +#define PTO2_DUMP_TENSOR 1 +#endif + // ============================================================================= // AICPU Error Codes (written to shared memory for Host-side diagnosis) // ============================================================================= diff --git a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 2db2d00bc..3b4f541ba 100644 --- a/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -18,6 +18,8 @@ #include "aicpu/device_time.h" #include "aicpu/performance_collector_aicpu.h" #include "aicpu/platform_regs.h" +#include "aicpu/tensor_dump_aicpu.h" +#include "callable.h" #include "common/memory_barrier.h" #include "common/perf_profiling.h" #include "common/platform_config.h" @@ -115,8 +117,8 @@ struct AicpuExecutor { // Helper functions (inline to avoid linker issues, not always_inline to preserve barriers) inline void resolve_task_dependencies( - Task *task, Runtime &runtime, int *cur_ready_queue_aic, int &cur_aic_tail, int &cur_aic_ready_count, - int *cur_ready_queue_aiv, int &cur_aiv_tail, int &cur_aiv_ready_count + Task *task, Runtime &runtime, int thread_idx, int *cur_ready_queue_aic, int &cur_aic_tail, + int &cur_aic_ready_count, int *cur_ready_queue_aiv, int &cur_aiv_tail, int &cur_aiv_ready_count ); inline bool try_dispatch_task( @@ -127,13 
+129,56 @@ struct AicpuExecutor { static AicpuExecutor g_aicpu_executor; +#if PTO2_DUMP_TENSOR +static int +collect_task_tensor_buffer_addrs(const Runtime &runtime, const Task &task, uint64_t *buffer_addrs, int max_count) { + int found = 0; + for (int arg_idx = 0; arg_idx < task.num_args; arg_idx++) { + uint64_t arg = task.args[arg_idx]; + if (!runtime.is_tensor_buffer_addr(arg)) { + continue; + } + if (found < max_count) { + buffer_addrs[found] = arg; + } + found++; + } + return found; +} +#endif + // ===== Helper Function Implementations ===== // Resolve dependencies: decrement fanin and enqueue newly ready tasks inline void AicpuExecutor::resolve_task_dependencies( - Task *task, Runtime &runtime, int *cur_ready_queue_aic, int &cur_aic_tail, int &cur_aic_ready_count, + Task *task, Runtime &runtime, int thread_idx, int *cur_ready_queue_aic, int &cur_aic_tail, int &cur_aic_ready_count, int *cur_ready_queue_aiv, int &cur_aiv_tail, int &cur_aiv_ready_count ) { + if (task == nullptr) { + return; + } + +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + uint64_t callable_addr = runtime.get_function_bin_addr(task->func_id); + if (callable_addr != 0) { + const CoreCallable *callable = reinterpret_cast(callable_addr); + int tensor_info_count = 0; + const TensorInfo *tensor_info = runtime.get_tensor_info(task->task_id, &tensor_info_count); + uint64_t tensor_buffer_addrs[RUNTIME_MAX_ARGS] = {}; + int tensor_buffer_count = + collect_task_tensor_buffer_addrs(runtime, *task, tensor_buffer_addrs, RUNTIME_MAX_ARGS); + dump_tensors_for_task( + thread_idx, static_cast(task->task_id), 0, task->num_args, task->func_id, *callable, + tensor_info, tensor_info_count, tensor_buffer_addrs, tensor_buffer_count, + TensorDumpStage::AFTER_COMPLETION + ); + } + } +#else + (void)thread_idx; +#endif + for (int j = 0; j < task->fanout_count; j++) { int dep_id = task->fanout[j]; Task *dep = runtime.get_task(dep_id); @@ -197,6 +242,28 @@ inline bool AicpuExecutor::try_dispatch_task( 
running_task_ids_[core_id] ); +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + Task *task = runtime.get_task(task_id); + if (task != nullptr) { + uint64_t callable_addr = runtime.get_function_bin_addr(task->func_id); + if (callable_addr != 0) { + const CoreCallable *callable = reinterpret_cast(callable_addr); + int tensor_info_count = 0; + const TensorInfo *tensor_info = runtime.get_tensor_info(task_id, &tensor_info_count); + uint64_t tensor_buffer_addrs[RUNTIME_MAX_ARGS] = {}; + int tensor_buffer_count = + collect_task_tensor_buffer_addrs(runtime, *task, tensor_buffer_addrs, RUNTIME_MAX_ARGS); + dump_tensors_for_task( + thread_idx, static_cast(task_id), 0, task->num_args, task->func_id, *callable, + tensor_info, tensor_info_count, tensor_buffer_addrs, tensor_buffer_count, + TensorDumpStage::BEFORE_DISPATCH + ); + } + } + } +#endif + // Set state before writing register to avoid race with AICore ACK pending_task_ids_[core_id] = task_id; @@ -266,6 +333,11 @@ int AicpuExecutor::init(Runtime *runtime) { if (runtime->enable_profiling) { perf_aicpu_init_profiling(runtime); } +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensor_init(thread_num_); + } +#endif init_done_.store(true, std::memory_order_release); LOG_INFO("AicpuExecutor: Init complete"); @@ -696,7 +768,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const Task *prev_running_task = runtime.get_task(prev_running_id); resolve_task_dependencies( - prev_running_task, runtime, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, + prev_running_task, runtime, thread_idx, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count ); @@ -705,8 +777,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const Task *task = runtime.get_task(completed_task_id); resolve_task_dependencies( - task, runtime, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, cur_ready_queue_aiv, - 
cur_aiv_tail, cur_aiv_ready_count + task, runtime, thread_idx, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, + cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count ); made_progress = true; @@ -760,7 +832,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const Task *prev_running_task = runtime.get_task(prev_running_id); resolve_task_dependencies( - prev_running_task, runtime, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, + prev_running_task, runtime, thread_idx, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count ); @@ -818,8 +890,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const Task *task = runtime.get_task(completed_task_id); resolve_task_dependencies( - task, runtime, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, cur_ready_queue_aiv, - cur_aiv_tail, cur_aiv_ready_count + task, runtime, thread_idx, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, + cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count ); made_progress = true; @@ -988,6 +1060,11 @@ int AicpuExecutor::run(Runtime *runtime) { if (runtime->enable_profiling) { perf_aicpu_flush_buffers(runtime, thread_idx, cur_thread_cores, thread_cores_num_[thread_idx]); } +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensor_flush(thread_idx); + } +#endif LOG_INFO("Thread %d: Completed", thread_idx); @@ -1007,6 +1084,17 @@ void AicpuExecutor::deinit(Runtime *runtime) { // Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but // bypasses this cache. Invalidating now ensures next round reads from HBM. 
cache_invalidate_range(runtime, sizeof(Runtime)); + if (runtime->get_tensor_info_storage() != nullptr && runtime->get_tensor_info_storage_bytes() > 0) { + cache_invalidate_range( + runtime->get_tensor_info_storage(), static_cast(runtime->get_tensor_info_storage_bytes()) + ); + } + if (runtime->get_tensor_allocation_storage() != nullptr && runtime->get_tensor_allocation_storage_bytes() > 0) { + cache_invalidate_range( + runtime->get_tensor_allocation_storage(), + static_cast(runtime->get_tensor_allocation_storage_bytes()) + ); + } // === Existing reset logic === ready_count_aic_.store(0, std::memory_order_release); diff --git a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp index bac4c3052..57f4e1586 100644 --- a/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a2a3/runtime/host_build_graph/host/runtime_maker.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include "callable.h" #include "orchestration_api.h" @@ -46,16 +47,84 @@ namespace { struct OrchestrationRuntimeImpl { const OrchestrationRuntimeOps *ops; Runtime *runtime; + struct TensorInfoBuilder *tensor_info_builder; + struct TensorAllocationBuilder *tensor_allocation_builder; +}; + +struct TensorInfoBuilder { + std::vector> tensor_info_by_task; + + int set_tensor_info_to_task(int task_id, const TensorInfo *tensor_info, int tensor_count) { + if (task_id < 0 || tensor_count < 0 || tensor_count > RUNTIME_MAX_ARGS) { + return -1; + } + if (static_cast(task_id) >= tensor_info_by_task.size()) { + tensor_info_by_task.resize(static_cast(task_id) + 1); + } + std::vector &task_info = tensor_info_by_task[static_cast(task_id)]; + task_info.assign(tensor_info, tensor_info + tensor_count); + return 0; + } +}; + +struct TensorAllocationBuilder { + std::vector allocations; + + void record_allocation(void *ptr, size_t size) { + if (ptr == nullptr || size == 0) { + return; + } + allocations.push_back({reinterpret_cast(ptr), 
static_cast(size)}); + } + + void erase_allocation(void *ptr) { + if (ptr == nullptr) { + return; + } + uint64_t base_addr = reinterpret_cast(ptr); + for (auto it = allocations.begin(); it != allocations.end(); ++it) { + if (it->base_addr == base_addr) { + allocations.erase(it); + return; + } + } + } }; Runtime *unwrap_runtime(OrchestrationRuntime *runtime) { return reinterpret_cast(runtime)->runtime; } +TensorInfoBuilder *unwrap_tensor_info_builder(OrchestrationRuntime *runtime) { + return reinterpret_cast(runtime)->tensor_info_builder; +} + +TensorAllocationBuilder *unwrap_tensor_allocation_builder(OrchestrationRuntime *runtime) { + return reinterpret_cast(runtime)->tensor_allocation_builder; +} + int runtime_add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) { return unwrap_runtime(runtime)->add_task(args, num_args, func_id, core_type); } +int runtime_set_tensor_info_to_task( + OrchestrationRuntime *runtime, int task_id, const TensorInfo *tensor_info, int tensor_count +) { + Runtime *host_runtime = unwrap_runtime(runtime); + if (task_id < 0 || task_id >= host_runtime->get_task_count()) { + LOG_ERROR("Invalid task_id %d for task tensor info", task_id); + return -1; + } + if (tensor_count == 0) { + return 0; + } + if (tensor_info == nullptr) { + LOG_ERROR("Task %d tensor info pointer is null", task_id); + return -1; + } + return unwrap_tensor_info_builder(runtime)->set_tensor_info_to_task(task_id, tensor_info, tensor_count); +} + void runtime_add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) { unwrap_runtime(runtime)->add_successor(from_task, to_task); } @@ -69,10 +138,13 @@ int runtime_get_task_count(OrchestrationRuntime *runtime) { return unwrap_runtim void runtime_print_runtime(OrchestrationRuntime *runtime) { unwrap_runtime(runtime)->print_runtime(); } void *runtime_device_malloc(OrchestrationRuntime *runtime, size_t size) { - return 
unwrap_runtime(runtime)->host_api.device_malloc(size); + void *ptr = unwrap_runtime(runtime)->host_api.device_malloc(size); + unwrap_tensor_allocation_builder(runtime)->record_allocation(ptr, size); + return ptr; } void runtime_device_free(OrchestrationRuntime *runtime, void *ptr) { + unwrap_tensor_allocation_builder(runtime)->erase_allocation(ptr); unwrap_runtime(runtime)->host_api.device_free(ptr); } @@ -81,8 +153,9 @@ int runtime_copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const v } const OrchestrationRuntimeOps k_orchestration_runtime_ops = { - runtime_add_task, runtime_add_successor, runtime_record_tensor_pair, runtime_get_task_count, - runtime_print_runtime, runtime_device_malloc, runtime_device_free, runtime_copy_to_device, + runtime_add_task, runtime_set_tensor_info_to_task, runtime_add_successor, runtime_record_tensor_pair, + runtime_get_task_count, runtime_print_runtime, runtime_device_malloc, runtime_device_free, + runtime_copy_to_device, }; bool write_all_bytes(int fd, const uint8_t *data, size_t size) { @@ -124,6 +197,78 @@ bool create_temp_so_file(const uint8_t *data, size_t size, std::string *out_path return true; } +int upload_tensor_info_storage(Runtime *runtime, const TensorInfoBuilder &builder) { + runtime->clear_tensor_info_storage(); + for (int task_id = 0; task_id < RUNTIME_MAX_TASKS; task_id++) { + runtime->set_tensor_info_range(task_id, 0, 0); + } + + int task_count = runtime->get_task_count(); + std::vector compact_tensor_info; + for (int task_id = 0; task_id < task_count; task_id++) { + const std::vector *task_info = nullptr; + if (static_cast(task_id) < builder.tensor_info_by_task.size()) { + task_info = &builder.tensor_info_by_task[static_cast(task_id)]; + } + uint32_t offset = static_cast(compact_tensor_info.size()); + uint16_t count = 0; + if (task_info != nullptr) { + count = static_cast(task_info->size()); + compact_tensor_info.insert(compact_tensor_info.end(), task_info->begin(), task_info->end()); + } + 
runtime->set_tensor_info_range(task_id, offset, count); + } + + if (compact_tensor_info.empty()) { + return 0; + } + + size_t tensor_info_bytes = compact_tensor_info.size() * sizeof(TensorInfo); + void *dev_tensor_info_storage = runtime->host_api.device_malloc(tensor_info_bytes); + if (dev_tensor_info_storage == nullptr) { + LOG_ERROR("Failed to allocate tensor info storage (%zu bytes)", tensor_info_bytes); + return -1; + } + + int rc = runtime->host_api.copy_to_device(dev_tensor_info_storage, compact_tensor_info.data(), tensor_info_bytes); + if (rc != 0) { + LOG_ERROR("Failed to copy tensor info storage to device: %d", rc); + runtime->host_api.device_free(dev_tensor_info_storage); + return rc; + } + + runtime->set_tensor_info_storage(dev_tensor_info_storage, tensor_info_bytes); + LOG_INFO("Uploaded %zu tensor info entries (%zu bytes)", compact_tensor_info.size(), tensor_info_bytes); + return 0; +} + +int upload_tensor_allocation_storage(Runtime *runtime, const TensorAllocationBuilder &builder) { + runtime->clear_tensor_allocation_storage(); + if (builder.allocations.empty()) { + return 0; + } + + size_t allocation_bytes = builder.allocations.size() * sizeof(TensorAllocationInfo); + void *dev_allocation_storage = runtime->host_api.device_malloc(allocation_bytes); + if (dev_allocation_storage == nullptr) { + LOG_ERROR("Failed to allocate tensor allocation storage (%zu bytes)", allocation_bytes); + return -1; + } + + int rc = runtime->host_api.copy_to_device(dev_allocation_storage, builder.allocations.data(), allocation_bytes); + if (rc != 0) { + LOG_ERROR("Failed to copy tensor allocation storage to device: %d", rc); + runtime->host_api.device_free(dev_allocation_storage); + return rc; + } + + runtime->set_tensor_allocation_storage( + dev_allocation_storage, static_cast(builder.allocations.size()), allocation_bytes + ); + LOG_INFO("Uploaded %zu tensor allocation ranges (%zu bytes)", builder.allocations.size(), allocation_bytes); + return 0; +} + } // namespace 
#ifdef __cplusplus @@ -215,7 +360,11 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip orch_args->tensor_count(), orch_args->scalar_count() ); - OrchestrationRuntimeImpl orchestration_runtime = {&k_orchestration_runtime_ops, runtime}; + TensorInfoBuilder tensor_info_builder; + TensorAllocationBuilder tensor_allocation_builder; + OrchestrationRuntimeImpl orchestration_runtime = { + &k_orchestration_runtime_ops, runtime, &tensor_info_builder, &tensor_allocation_builder + }; // Call orchestration function to build task graph // The orchestration function handles device memory allocation and copy-to-device @@ -227,6 +376,26 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip return rc; } + rc = upload_tensor_allocation_storage(runtime, tensor_allocation_builder); + if (rc != 0) { + LOG_ERROR("Failed to upload tensor allocations: %d", rc); + runtime->clear_tensor_pairs(); + dlclose(handle); + return rc; + } + + rc = upload_tensor_info_storage(runtime, tensor_info_builder); + if (rc != 0) { + LOG_ERROR("Failed to upload tensor info storage: %d", rc); + if (runtime->get_tensor_allocation_storage() != nullptr) { + runtime->host_api.device_free(runtime->get_tensor_allocation_storage()); + runtime->clear_tensor_allocation_storage(); + } + runtime->clear_tensor_pairs(); + dlclose(handle); + return rc; + } + LOG_INFO("Runtime initialized. Ready for execution from Python."); // Host orchestration is complete once orch_func returns. 
The task graph now @@ -294,6 +463,15 @@ int validate_runtime_impl(Runtime *runtime) { } runtime->clear_registered_kernels(); + if (runtime->get_tensor_info_storage() != nullptr) { + runtime->host_api.device_free(runtime->get_tensor_info_storage()); + runtime->clear_tensor_info_storage(); + } + if (runtime->get_tensor_allocation_storage() != nullptr) { + runtime->host_api.device_free(runtime->get_tensor_allocation_storage()); + runtime->clear_tensor_allocation_storage(); + } + // Clear tensor pairs runtime->clear_tensor_pairs(); diff --git a/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h b/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h index fa21cc916..34b754969 100644 --- a/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h +++ b/src/a2a3/runtime/host_build_graph/orchestration/orchestration_api.h @@ -23,11 +23,15 @@ #include "common/core_type.h" #include "task_args.h" +#include "tensor_info.h" typedef struct OrchestrationRuntime OrchestrationRuntime; typedef struct OrchestrationRuntimeOps { int (*add_task)(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type); + int (*set_tensor_info_to_task)( + OrchestrationRuntime *runtime, int task_id, const TensorInfo *tensor_info, int tensor_count + ); void (*add_successor)(OrchestrationRuntime *runtime, int from_task, int to_task); void (*record_tensor_pair)(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size); int (*get_task_count)(OrchestrationRuntime *runtime); @@ -47,6 +51,25 @@ add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_i return runtime->ops->add_task(runtime, args, num_args, func_id, core_type); } +static inline int +set_tensor_info_to_task(OrchestrationRuntime *runtime, int task_id, const TensorInfo *tensor_info, int tensor_count) { + return runtime->ops->set_tensor_info_to_task(runtime, task_id, tensor_info, tensor_count); +} + +static inline int 
add_task_with_tensor_info( + OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type, + const TensorInfo *tensor_info, int tensor_count +) { + int task_id = add_task(runtime, args, num_args, func_id, core_type); + if (task_id < 0) { + return task_id; + } + if (set_tensor_info_to_task(runtime, task_id, tensor_info, tensor_count) != 0) { + return -1; + } + return task_id; +} + static inline void add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) { runtime->ops->add_successor(runtime, from_task, to_task); } diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp index 9899b1a48..25af6e4c7 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp @@ -48,6 +48,11 @@ Runtime::Runtime() { enable_profiling = false; perf_data_base = 0; tensor_pair_count = 0; + tensor_info_storage_ = nullptr; + tensor_info_storage_bytes_ = 0; + tensor_allocation_storage_ = nullptr; + tensor_allocation_storage_bytes_ = 0; + tensor_allocation_count_ = 0; // Initialize kernel binary tracking registered_kernel_count_ = 0; @@ -56,6 +61,8 @@ Runtime::Runtime() { for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) { func_id_to_addr_[i] = 0; } + memset(tensor_info_offsets_, 0, sizeof(tensor_info_offsets_)); + memset(tensor_info_counts_, 0, sizeof(tensor_info_counts_)); } // ============================================================================= diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 0c92ed234..f528acf81 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -39,6 +39,7 @@ #include "common/core_type.h" #include "common/perf_profiling.h" #include "common/platform_config.h" +#include "tensor_info.h" // Logging macros using unified logging interface #include 
"common/unified_log.h" @@ -226,6 +227,17 @@ class Runtime { int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID]; int registered_kernel_count_; + // Tensor info metadata for tensor dump + void *tensor_info_storage_; + uint64_t tensor_info_storage_bytes_; + uint32_t tensor_info_offsets_[RUNTIME_MAX_TASKS]; + uint16_t tensor_info_counts_[RUNTIME_MAX_TASKS]; + + // Device allocation ranges used to recover tensor buffer addresses from task.args[] + void *tensor_allocation_storage_; + uint64_t tensor_allocation_storage_bytes_; + uint32_t tensor_allocation_count_; + public: /** * Constructor - zero-initialize all arrays @@ -336,6 +348,78 @@ class Runtime { */ void clear_tensor_pairs(); + // ========================================================================= + // Tensor Info Metadata + // ========================================================================= + + void set_tensor_info_storage(void *ptr, uint64_t bytes) { + tensor_info_storage_ = ptr; + tensor_info_storage_bytes_ = bytes; + } + + void clear_tensor_info_storage() { + tensor_info_storage_ = nullptr; + tensor_info_storage_bytes_ = 0; + } + + void set_tensor_info_range(int task_id, uint32_t offset, uint16_t count) { + if (task_id < 0 || task_id >= RUNTIME_MAX_TASKS) return; + tensor_info_offsets_[task_id] = offset; + tensor_info_counts_[task_id] = count; + } + + const TensorInfo *get_tensor_info(int task_id, int *count) const { + if (count != nullptr) { + *count = 0; + } + if (task_id < 0 || task_id >= RUNTIME_MAX_TASKS || tensor_info_storage_ == nullptr) { + return nullptr; + } + uint16_t tensor_info_count = tensor_info_counts_[task_id]; + if (tensor_info_count == 0) { + return nullptr; + } + if (count != nullptr) { + *count = static_cast(tensor_info_count); + } + const TensorInfo *base = reinterpret_cast(tensor_info_storage_); + return base + tensor_info_offsets_[task_id]; + } + + void *get_tensor_info_storage() const { return tensor_info_storage_; } + + uint64_t get_tensor_info_storage_bytes() const 
{ return tensor_info_storage_bytes_; } + + void set_tensor_allocation_storage(void *ptr, uint32_t count, uint64_t bytes) { + tensor_allocation_storage_ = ptr; + tensor_allocation_count_ = count; + tensor_allocation_storage_bytes_ = bytes; + } + + void clear_tensor_allocation_storage() { + tensor_allocation_storage_ = nullptr; + tensor_allocation_count_ = 0; + tensor_allocation_storage_bytes_ = 0; + } + + bool is_tensor_buffer_addr(uint64_t addr) const { + if (tensor_allocation_storage_ == nullptr || tensor_allocation_count_ == 0) { + return false; + } + const TensorAllocationInfo *allocations = + reinterpret_cast(tensor_allocation_storage_); + for (uint32_t i = 0; i < tensor_allocation_count_; i++) { + if (allocations[i].contains(addr)) { + return true; + } + } + return false; + } + + void *get_tensor_allocation_storage() const { return tensor_allocation_storage_; } + + uint64_t get_tensor_allocation_storage_bytes() const { return tensor_allocation_storage_bytes_; } + // ========================================================================= // Device Orchestration (stub for API compatibility) // ========================================================================= diff --git a/src/a2a3/runtime/host_build_graph/runtime/tensor_info.h b/src/a2a3/runtime/host_build_graph/runtime/tensor_info.h new file mode 100644 index 000000000..5d2bf0b30 --- /dev/null +++ b/src/a2a3/runtime/host_build_graph/runtime/tensor_info.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_TENSOR_INFO_H_
+#define SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_TENSOR_INFO_H_
+
+#include <cstdint>
+
+#include "common/platform_config.h"
+#include "data_type.h"
+#include "tensor_arg.h"
+
+// =============================================================================
+// Dump Tensor Configuration
+// =============================================================================
+
+#ifndef PTO2_DUMP_TENSOR
+#define PTO2_DUMP_TENSOR 1
+#endif
+
+struct TensorInfo {
+    DataType dtype;
+    uint8_t ndims;
+    uint16_t reserved;
+    uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];
+    uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS];
+    uint32_t offsets[PLATFORM_DUMP_MAX_DIMS];
+};
+
+static_assert(sizeof(TensorInfo) == 64, "TensorInfo must stay compact");
+
+struct TensorAllocationInfo {
+    uint64_t base_addr;
+    uint64_t size_bytes;
+
+    bool contains(uint64_t addr) const { return addr >= base_addr && addr < base_addr + size_bytes; }
+};
+
+static_assert(sizeof(TensorAllocationInfo) == 16, "TensorAllocationInfo must stay compact");
+
+inline TensorInfo make_tensor_info(
+    DataType dtype, uint32_t ndims, const uint32_t *shapes, const uint32_t *raw_shapes = nullptr,
+    const uint32_t *offsets = nullptr
+) {
+    TensorInfo info = {};
+    info.dtype = dtype;
+    info.ndims = static_cast<uint8_t>(ndims);
+    for (uint32_t i = 0; i < ndims && i < PLATFORM_DUMP_MAX_DIMS; i++) {
+        info.shapes[i] = shapes[i];
+        info.raw_shapes[i] = (raw_shapes != nullptr) ? raw_shapes[i] : shapes[i];
+        info.offsets[i] = (offsets != nullptr) ? offsets[i] : 0;
+    }
+    return info;
+}
+
+inline TensorInfo make_tensor_info_from_tensor_arg(const ContinuousTensor &tensor) {
+    return make_tensor_info(tensor.dtype, tensor.ndims, tensor.shapes);
+}
+
+#endif  // SRC_A2A3_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_TENSOR_INFO_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 54bd70ce0..177af14ca 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -36,6 +36,7 @@
 // Performance profiling headers
 #include "aicpu/performance_collector_aicpu.h"
+#include "aicpu/tensor_dump_aicpu.h"
 #include "common/memory_barrier.h"
 #include "common/perf_profiling.h"
 #include "common/unified_log.h"
@@ -492,6 +493,19 @@ struct AicpuExecutor {
 #endif
         bool mixed_complete = rt->scheduler.on_subtask_complete(slot_state);
         if (mixed_complete) {
+#if PTO2_DUMP_TENSOR
+            if (get_enable_dump_tensor()) {
+                dump_tensors_for_task(
+                    thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION,
+                    [](uint8_t active_mask, uint8_t raw_subtask_id) {
+                        return pto2_subtask_active(active_mask, static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+                    },
+                    [this](int32_t func_id) {
+                        return get_function_bin_addr(func_id);
+                    }
+                );
+            }
+#endif
 #if PTO2_SCHED_PROFILING
             PTO2CompletionStats cstats = rt->scheduler.on_mixed_task_complete(slot_state, thread_idx, local_bufs);
             notify_edges_total += cstats.fanout_edges;
@@ -949,6 +963,19 @@ struct AicpuExecutor {
 #endif
     ) {
         CoreTracker &tracker = core_trackers_[thread_idx];
+#if PTO2_DUMP_TENSOR
+        if (get_enable_dump_tensor()) {
+            dump_tensors_for_task(
+                thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH,
+                [](uint8_t active_mask, uint8_t raw_subtask_id) {
+                    return pto2_subtask_active(active_mask, static_cast<PTO2SubtaskSlot>(raw_subtask_id));
+                },
+                [this](int32_t func_id) {
+                    return get_function_bin_addr(func_id);
+                }
+            );
+        }
+#endif
         if (shape == PTO2ResourceShape::MIX) {
             dispatch_mix_block_to_cluster(
                 runtime, thread_idx, cluster_offset, slot_state
@@ -1512,6 +1539,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
         perf_aicpu_set_orch_thread_idx(sched_thread_num_);
     }
 #endif
+#if PTO2_DUMP_TENSOR
+    if (get_enable_dump_tensor()) {
+        dump_tensor_init(orch_to_sched_ ? thread_num_ : sched_thread_num_);
+    }
+#endif
     DEV_INFO("Thread %d: one-time init done", thread_idx);
     pto2_init_complete_.store(true, std::memory_order_release);
@@ -2243,6 +2275,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa
         perf_aicpu_flush_phase_buffers(thread_idx);
     }
 #endif
+#if PTO2_DUMP_TENSOR
+    if (get_enable_dump_tensor()) {
+        dump_tensor_flush(thread_idx);
+    }
+#endif
     return cur_thread_completed;
 }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
index a2d8ae976..5f20f1739 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h
@@ -69,6 +69,14 @@
 #error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1"
 #endif
 
+// =============================================================================
+// Dump Tensor Configuration
+// =============================================================================
+
+#ifndef PTO2_DUMP_TENSOR
+#define PTO2_DUMP_TENSOR 1
+#endif
+
 // =============================================================================
 // Configuration Constants
 // =============================================================================
diff --git a/src/a5/platform/include/aicpu/tensor_dump_aicpu.h b/src/a5/platform/include/aicpu/tensor_dump_aicpu.h
new file mode 100644
index 000000000..7d3a7328d
--- /dev/null
+++ b/src/a5/platform/include/aicpu/tensor_dump_aicpu.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+/**
+ * @file tensor_dump_aicpu.h
+ * @brief AICPU tensor dump collection interface (memcpy-based)
+ *
+ * Provides tensor dump management for AICPU side.
+ * Same public API as A2A3 for future migration compatibility.
+ * Simplified internals: direct buffer writes, no SPSC queues.
+ */
+
+#ifndef SRC_A5_PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
+#define SRC_A5_PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_
+
+#include <cinttypes>
+
+#include "common/memory_barrier.h"
+#include "common/tensor_dump.h"
+#include "data_type.h"
+
+#ifdef __cplusplus
+#include "callable.h"
+#include "common/unified_log.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Set the tensor dump base address.
+ * Called by the platform layer before AICPU execution starts.
+ *
+ * @param dump_data_base Device pointer (as uint64_t) to DumpSetupHeader
+ */
+void set_platform_dump_base(uint64_t dump_data_base);
+
+/**
+ * Get the tensor dump base address.
+ *
+ * @return Device pointer (as uint64_t) to DumpSetupHeader
+ */
+uint64_t get_platform_dump_base();
+
+/**
+ * Set whether tensor dump is enabled for this execution.
+ * Called by the platform layer before AICPU execution starts.
+ *
+ * @param enable true to enable tensor dump, false to disable
+ */
+void set_enable_dump_tensor(bool enable);
+
+/**
+ * Get whether tensor dump is enabled for this execution.
+ *
+ * @return true if tensor dump is enabled
+ */
+bool get_enable_dump_tensor();
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role);
+int32_t count_callable_tensor_args(const CoreCallable &callable);
+bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage);
+bool try_log_tensor_dump_layout_mismatch();
+int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
+
+// NOTE(review): template parameter list reconstructed after extraction stripped the
+// angle brackets. MaxSubtaskSlots is not deducible from the call sites, so it must
+// carry a default; 3 matches the AIC/AIV0/AIV1 slot encoding — confirm against A2A3.
+template <typename SlotStateT, typename IsSubtaskActiveFn, typename GetFunctionBinAddrFn, int MaxSubtaskSlots = 3>
+inline void dump_tensors_for_task(
+    int32_t thread_idx, const SlotStateT &slot_state, TensorDumpStage stage, IsSubtaskActiveFn is_subtask_active,
+    GetFunctionBinAddrFn get_function_bin_addr
+) {
+    const auto &pl = *slot_state.payload;
+    const CoreCallable *callables[MaxSubtaskSlots] = {};
+    int32_t total_tensor_args = 0;
+
+    for (int raw_subtask_id = 0; raw_subtask_id < MaxSubtaskSlots; raw_subtask_id++) {
+        if (!is_subtask_active(slot_state.active_mask, raw_subtask_id)) {
+            continue;
+        }
+        int32_t slot_idx = raw_subtask_id;
+        uint64_t callable_addr = get_function_bin_addr(slot_state.task->kernel_id[slot_idx]);
+        if (callable_addr == 0) {
+            return;
+        }
+        callables[slot_idx] = reinterpret_cast<const CoreCallable *>(callable_addr);
+        total_tensor_args += count_callable_tensor_args(*callables[slot_idx]);
+    }
+
+    if (total_tensor_args != pl.tensor_count) {
+        if (try_log_tensor_dump_layout_mismatch()) {
+            LOG_WARN(
+                "Thread %d: tensor dump skipped for task 0x%" PRIx64
+                ": active callable tensor count (%d) does not match payload tensor count (%d). "
+                "Task-level dump assumes payload tensors are concatenated by active subtask order.",
+                thread_idx, static_cast<uint64_t>(slot_state.task->task_id.raw), total_tensor_args, pl.tensor_count
+            );
+        }
+        return;
+    }
+
+    rmb();
+
+    int32_t payload_index = 0;
+    for (int raw_subtask_id = 0; raw_subtask_id < MaxSubtaskSlots; raw_subtask_id++) {
+        if (!is_subtask_active(slot_state.active_mask, raw_subtask_id)) {
+            continue;
+        }
+        int32_t slot_idx = raw_subtask_id;
+        const CoreCallable &callable = *callables[slot_idx];
+        for (int32_t sig_idx = 0; sig_idx < callable.sig_count(); sig_idx++) {
+            ArgDirection dir = callable.sig(sig_idx);
+            if (dir == ArgDirection::SCALAR) {
+                continue;
+            }
+            TensorDumpRole role;
+            if (get_tensor_dump_role_from_direction(dir, &role) && should_dump_tensor_at_stage(role, stage)) {
+                const auto &t = pl.tensors[payload_index];
+                TensorDumpInfo info = {};
+                info.buffer_addr = t.buffer.addr;
+                info.dtype = static_cast<uint8_t>(t.dtype);
+                info.ndims = static_cast<uint8_t>(t.ndims);
+                const uint32_t *raw_shapes = t.get_raw_shapes();
+                for (uint32_t d = 0; d < t.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) {
+                    info.shapes[d] = t.shapes[d];
+                    info.offsets[d] = t.is_all_offset_zero ? 0 : t.offsets[d];
+                    info.raw_shapes[d] = raw_shapes[d];
+                }
+                info.task_id = slot_state.task->task_id.raw;
+                info.subtask_id = raw_subtask_id;
+                info.func_id = slot_state.task->kernel_id[slot_idx];
+                info.arg_index = static_cast<uint32_t>(payload_index);
+                info.role = role;
+                info.stage = stage;
+                dump_tensor_record(thread_idx, info);
+            }
+            payload_index++;
+        }
+    }
+}
+
+template <typename TensorInfoT>
+inline void dump_tensors_for_task(
+    int32_t thread_idx, uint64_t task_id, uint8_t subtask_id, int32_t task_arg_count, int32_t func_id,
+    const CoreCallable &callable, const TensorInfoT *tensor_info, int32_t tensor_info_count,
+    const uint64_t *buffer_addrs, int32_t buffer_count, TensorDumpStage stage
+) {
+    int32_t sig_count = callable.sig_count();
+    if (task_arg_count < sig_count) {
+        static bool logged_task_signature_mismatch = false;
+        if (!logged_task_signature_mismatch) {
+            logged_task_signature_mismatch = true;
+            LOG_WARN(
+                "Thread %d: tensor dump skipped for task 0x%" PRIx64
+                ": task args (%d) smaller than callable signature (%d)",
+                thread_idx, task_id, task_arg_count, sig_count
+            );
+        }
+        return;
+    }
+
+    int32_t tensor_arg_count = count_callable_tensor_args(callable);
+    if (tensor_info == nullptr || tensor_info_count != tensor_arg_count) {
+        if (tensor_arg_count == 0) {
+            return;
+        }
+        if (try_log_tensor_dump_layout_mismatch()) {
+            LOG_WARN(
+                "Thread %d: tensor dump skipped for task 0x%" PRIx64
+                ": callable tensor args (%d) do not match registered tensor info (%d)",
+                thread_idx, task_id, tensor_arg_count, tensor_info_count
+            );
+        }
+        return;
+    }
+
+    if (buffer_addrs == nullptr || buffer_count != tensor_arg_count) {
+        static bool logged_task_tensor_addr_mismatch = false;
+        if (!logged_task_tensor_addr_mismatch) {
+            logged_task_tensor_addr_mismatch = true;
+            LOG_WARN(
+                "Thread %d: tensor dump skipped for task 0x%" PRIx64
+                ": reconstructed tensor buffers (%d) do not match callable tensor args (%d)",
+                thread_idx, task_id, buffer_count, tensor_arg_count
+            );
+        }
+        return;
+    }
+
+    rmb();
+
+    int32_t tensor_arg_index = 0;
+    for (int32_t sig_idx = 0; sig_idx < sig_count; sig_idx++) {
+        ArgDirection dir = callable.sig(sig_idx);
+        if (dir == ArgDirection::SCALAR) {
+            continue;
+        }
+
+        TensorDumpRole role;
+        if (!get_tensor_dump_role_from_direction(dir, &role) || !should_dump_tensor_at_stage(role, stage)) {
+            tensor_arg_index++;
+            continue;
+        }
+
+        const auto &t = tensor_info[tensor_arg_index];
+        TensorDumpInfo info = {};
+        info.task_id = task_id;
+        info.subtask_id = subtask_id;
+        info.role = role;
+        info.stage = stage;
+        info.dtype = static_cast<uint8_t>(t.dtype);
+        info.ndims = t.ndims;
+        info.func_id = static_cast<uint32_t>(func_id);
+        info.arg_index = static_cast<uint32_t>(tensor_arg_index);
+        info.buffer_addr = buffer_addrs[tensor_arg_index];
+        for (uint32_t d = 0; d < t.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) {
+            info.shapes[d] = t.shapes[d];
+            info.offsets[d] = t.offsets[d];
+            info.raw_shapes[d] = t.raw_shapes[d];
+        }
+        dump_tensor_record(thread_idx, info);
+        tensor_arg_index++;
+    }
+}
+#endif
+
+/**
+ * Initialize tensor dump.
+ *
+ * Reads DumpSetupHeader from dump_data_base and caches per-thread
+ * DumpBuffer and arena pointers.
+ *
+ * @param num_dump_threads Number of scheduling threads that will dump tensors
+ */
+void dump_tensor_init(int num_dump_threads);
+
+/**
+ * Record a single tensor dump.
+ *
+ * Copies tensor data to the thread's arena, appends a TensorDumpRecord
+ * to the thread's DumpBuffer. Silently drops when buffer is full.
+ *
+ * @param thread_idx Scheduling thread index
+ * @param info Tensor metadata and identification
+ * @return 0 on success or intentional drop, -1 only when dump state is unavailable
+ */
+int dump_tensor_record(int thread_idx, const TensorDumpInfo &info);
+
+/**
+ * Flush remaining tensor dump data for a thread.
+ *
+ * In the memcpy design this is a no-op for data (host reads after sync).
+ * Logs per-thread dump statistics.
+ * + * @param thread_idx Thread index + */ +void dump_tensor_flush(int thread_idx); + +#endif // SRC_A5_PLATFORM_AICPU_TENSOR_DUMP_AICPU_H_ diff --git a/src/a5/platform/include/common/kernel_args.h b/src/a5/platform/include/common/kernel_args.h index e4dee44b8..9bb81ee12 100644 --- a/src/a5/platform/include/common/kernel_args.h +++ b/src/a5/platform/include/common/kernel_args.h @@ -69,6 +69,7 @@ struct KernelArgs { DeviceArgs *device_args{nullptr}; // Device arguments (AICPU reads, contains SO info) Runtime *runtime_args{nullptr}; // Task runtime in device memory uint64_t regs{0}; // Per-core register base address array (platform-specific) + uint64_t dump_data_base{0}; // Dump shared memory base address, zero when unused }; #ifdef __cplusplus diff --git a/src/a5/platform/include/common/platform_config.h b/src/a5/platform/include/common/platform_config.h index 4dd043b0a..bdadec7c4 100644 --- a/src/a5/platform/include/common/platform_config.h +++ b/src/a5/platform/include/common/platform_config.h @@ -111,6 +111,55 @@ inline double cycles_to_us(uint64_t cycles) { return (static_cast(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0; } +// ============================================================================= +// Tensor Dump Configuration +// ============================================================================= + +/** + * Number of TensorDumpRecord entries per DumpBuffer. + * Each record is 128 bytes, so one buffer = RECORDS * 128 bytes. + */ +constexpr int PLATFORM_DUMP_RECORDS_PER_BUFFER = 256; + +/** + * Pre-allocated DumpBuffer count per AICPU scheduling thread. + * Retained for configuration parity with A2A3; in the memcpy design + * this controls arena sizing only (no SPSC free queues). + */ +constexpr int PLATFORM_DUMP_BUFFERS_PER_THREAD = 8; + +/** + * SPSC free_queue slot count for dump metadata buffers. + * Retained for configuration parity with A2A3. 
+ */ +constexpr int PLATFORM_DUMP_SLOT_COUNT = 4; + +/** + * Expected average tensor size in bytes. + * Used together with BUFFERS_PER_THREAD and RECORDS_PER_BUFFER to compute + * per-thread arena size: + * arena = BUFFERS_PER_THREAD * RECORDS_PER_BUFFER * AVG_TENSOR_BYTES + * Default: 4 * 256 * 65536 = 64 MB per thread. + */ +constexpr uint64_t PLATFORM_DUMP_AVG_TENSOR_BYTES = 65536; + +/** + * Maximum tensor dimensions (matches RUNTIME_MAX_TENSOR_DIMS). + */ +constexpr int PLATFORM_DUMP_MAX_DIMS = 5; + +/** + * Ready queue capacity for dump data. + * Retained for configuration parity with A2A3. + */ +constexpr int PLATFORM_DUMP_READYQUEUE_SIZE = PLATFORM_MAX_AICPU_THREADS * PLATFORM_DUMP_BUFFERS_PER_THREAD * 2; + +/** + * Idle timeout duration for tensor dump collection (seconds) + * Retained for configuration parity with A2A3. + */ +constexpr int PLATFORM_DUMP_TIMEOUT_SECONDS = 30; + // ============================================================================= // Register Communication Configuration // ============================================================================= diff --git a/src/a5/platform/include/common/tensor_dump.h b/src/a5/platform/include/common/tensor_dump.h new file mode 100644 index 000000000..9595b74b9 --- /dev/null +++ b/src/a5/platform/include/common/tensor_dump.h @@ -0,0 +1,231 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file tensor_dump.h + * @brief Tensor dump data structures for device-to-host tensor collection (memcpy-based) + * + * A5 simplified design: pre-allocated buffers + direct write + memcpy collect-after-sync. + * Mirrors PerformanceCollector pattern — no shared memory, no background threads, + * no SPSC queues. + * + * Memory layout (allocated only when enable_dump_tensor=true): + * + * DumpSetupHeader (single, published via kernel_args.dump_data_base) + * ├── dump_buffer_ptrs[] → per-thread DumpBuffer (count + records[]) + * ├── arena_header_ptrs[] → per-thread DumpArenaHeader (write_offset) + * └── arena_data_ptrs[] → per-thread arena data region + * + * After stream sync, host copies everything back via rtMemcpy / memcpy. + */ + +#ifndef SRC_A5_PLATFORM_INCLUDE_COMMON_TENSOR_DUMP_H_ +#define SRC_A5_PLATFORM_INCLUDE_COMMON_TENSOR_DUMP_H_ + +#include +#include + +#include "common/platform_config.h" + +// ============================================================================= +// Constants +// ============================================================================= + +constexpr uint32_t TENSOR_DUMP_MAGIC = 0x44554D50; // "DUMP" + +// ============================================================================= +// TensorDumpRole - Formal kernel signature direction +// ============================================================================= + +enum class TensorDumpRole : uint8_t { + INPUT = 0, + OUTPUT = 1, + INOUT = 2, +}; + +// ============================================================================= +// TensorDumpStage - When the tensor was captured +// ============================================================================= + +enum class TensorDumpStage : uint8_t { + BEFORE_DISPATCH = 0, + AFTER_COMPLETION = 1, +}; + +// ============================================================================= +// 
TensorDumpRecord - Single Tensor Dump Entry (128B = 2 cache lines) +// ============================================================================= + +/** + * Per-tensor metadata + payload reference. + * Identical layout to A2A3 for binary compatibility. + * + * Cache line 1 (64B): identifiers, payload location, compact scalar metadata + * Cache line 2 (64B): logical/source layout arrays + */ +struct alignas(64) TensorDumpRecord { + // === Cache line 1 (64B) === + uint64_t task_id; // PTO2 encoding or plain task index + uint8_t subtask_id; // PTO2SubtaskSlot raw value (AIC=0, AIV0=1, AIV1=2) + uint8_t role; // TensorDumpRole (formal callable signature) + uint8_t stage; // TensorDumpStage (before/after execution) + uint8_t ndims; // Number of dimensions + uint32_t func_id; // Kernel function identifier + uint32_t arg_index; // Position in PTO2TaskPayload::tensors[] + uint8_t dtype; // DataType raw enum value + uint8_t truncated; // 1 if payload was truncated (tensor > arena capacity) + uint8_t is_contiguous; // 1 when source view is already contiguous + uint8_t pad0_align; // Explicit alignment before 64-bit payload offsets + uint64_t payload_offset; // Monotonic byte offset into thread arena + uint64_t payload_size; // Bytes actually copied (may be < full tensor bytes) + uint8_t pad0[24]; // Preserve 64B cache-line layout + + // === Cache line 2 (64B) === + uint32_t shapes[PLATFORM_DUMP_MAX_DIMS]; // Current view shape + uint32_t offsets[PLATFORM_DUMP_MAX_DIMS]; // Multi-dimensional offsets + uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS]; // Underlying source layout shape + uint8_t pad1[4]; // Pad to 128 bytes +} __attribute__((aligned(64))); + +static_assert(sizeof(TensorDumpRecord) == 128, "TensorDumpRecord must be 128 bytes (2 cache lines)"); + +// ============================================================================= +// DumpBuffer - Per-Thread Record Buffer (count-first, like PerfBuffer) +// 
============================================================================= + +/** + * Per-thread dump record buffer. AICPU writes records sequentially; + * when count reaches capacity, further records are silently dropped. + * Host copies the buffer back after stream sync. + */ +struct DumpBuffer { + volatile uint32_t count; // Records written by AICPU (at offset 0) + uint32_t capacity; // Max records (set by host during init) + volatile uint32_t dropped_count; // Records dropped (buffer or arena full) + uint32_t pad[13]; // Pad header to 64B cache line + // TensorDumpRecord records[] follows (flexible array member) + // Access via: reinterpret_cast(this + 1) +} __attribute__((aligned(64))); + +static_assert(sizeof(DumpBuffer) == 64, "DumpBuffer header must be 64 bytes"); + +// ============================================================================= +// DumpArenaHeader - Per-Thread Arena Metadata +// ============================================================================= + +/** + * Per-thread arena metadata. Separate from the arena data region + * so host can read just the header to determine how much data was written. + */ +struct DumpArenaHeader { + volatile uint64_t write_offset; // Monotonic write cursor (AICPU increments) + uint64_t arena_size; // Total arena bytes (set by host) + uint32_t pad[12]; // Pad to 64B +} __attribute__((aligned(64))); + +static_assert(sizeof(DumpArenaHeader) == 64, "DumpArenaHeader must be 64 bytes"); + +// ============================================================================= +// DumpSetupHeader - Host-Initialized, AICPU Reads +// ============================================================================= + +/** + * Setup header published via kernel_args.dump_data_base. + * Host initializes all fields and copies to device before execution. + * AICPU reads pointers during dump_tensor_init(). 
+ */
+struct DumpSetupHeader {
+    uint32_t num_dump_threads;
+    uint32_t records_per_buffer;
+    uint32_t magic;
+    uint32_t pad0;
+    // Per-thread device pointers
+    uint64_t dump_buffer_ptrs[PLATFORM_MAX_AICPU_THREADS];  // -> DumpBuffer
+    uint64_t arena_header_ptrs[PLATFORM_MAX_AICPU_THREADS]; // -> DumpArenaHeader
+    uint64_t arena_data_ptrs[PLATFORM_MAX_AICPU_THREADS];   // -> arena data region
+    uint64_t arena_sizes[PLATFORM_MAX_AICPU_THREADS];
+} __attribute__((aligned(64)));
+
+// =============================================================================
+// TensorDumpInfo - Lightweight Info Struct (passed from runtime to platform API)
+// =============================================================================
+
+/**
+ * Caller fills this struct from runtime-specific tensor types.
+ * Platform layer is agnostic to runtime-specific types (Tensor, PTO2TaskPayload, etc.).
+ * Identical to A2A3 TensorDumpInfo for API compatibility.
+ */
+struct TensorDumpInfo {
+    uint64_t task_id;
+    uint8_t subtask_id;
+    TensorDumpRole role;
+    TensorDumpStage stage;
+    uint8_t dtype;
+    uint8_t ndims;
+    uint32_t func_id;
+    uint32_t arg_index;
+    uint64_t buffer_addr;
+    uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];
+    uint32_t offsets[PLATFORM_DUMP_MAX_DIMS];
+    uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS];
+};
+
+// =============================================================================
+// Helper Functions - Memory Layout
+// =============================================================================
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Calculate DumpBuffer allocation size (header + records array).
+ *
+ * @param capacity Number of TensorDumpRecord entries
+ * @return Total bytes to allocate for one DumpBuffer
+ */
+inline size_t calc_dump_buffer_size(int capacity) {
+    return sizeof(DumpBuffer) + static_cast<size_t>(capacity) * sizeof(TensorDumpRecord);
+}
+
+/**
+ * Calculate per-thread arena size from configuration constants.
+ *
+ * @return Arena size in bytes per thread
+ */
+inline uint64_t calc_dump_arena_size() {
+    return static_cast<uint64_t>(PLATFORM_DUMP_BUFFERS_PER_THREAD) * PLATFORM_DUMP_RECORDS_PER_BUFFER *
+           PLATFORM_DUMP_AVG_TENSOR_BYTES;
+}
+
+/**
+ * Get DumpSetupHeader pointer from dump base address.
+ *
+ * @param base_ptr Dump shared memory base address (kernel_args.dump_data_base)
+ * @return DumpSetupHeader pointer
+ */
+inline DumpSetupHeader *get_dump_setup_header(void *base_ptr) { return reinterpret_cast<DumpSetupHeader *>(base_ptr); }
+
+/**
+ * Get pointer to the records array of a DumpBuffer.
+ *
+ * @param buf DumpBuffer pointer
+ * @return Pointer to the first TensorDumpRecord
+ */
+inline TensorDumpRecord *get_dump_buffer_records(DumpBuffer *buf) {
+    return reinterpret_cast<TensorDumpRecord *>(buf + 1);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // SRC_A5_PLATFORM_INCLUDE_COMMON_TENSOR_DUMP_H_
diff --git a/src/a5/platform/include/host/tensor_dump_collector.h b/src/a5/platform/include/host/tensor_dump_collector.h
new file mode 100644
index 000000000..35ffc9feb
--- /dev/null
+++ b/src/a5/platform/include/host/tensor_dump_collector.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file tensor_dump_collector.h
+ * @brief Host-side tensor dump collector (memcpy-based)
+ *
+ * Mirrors PerformanceCollector architecture:
+ *   - Host allocates per-thread DumpBuffers + arenas on device
+ *   - AICPU writes records and payload during execution
+ *   - After stream sync, host copies everything back via rtMemcpy/memcpy
+ *   - Export dump files (JSON manifest + binary payload)
+ *
+ * No background threads, no SPSC queues — simple collect-after-sync.
+ */
+
+#ifndef SRC_A5_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_
+#define SRC_A5_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "common/platform_config.h"
+#include "common/tensor_dump.h"
+#include "data_type.h"
+
+/**
+ * Device memory allocation callback.
+ */
+using DumpAllocCallback = void *(*)(size_t size);
+
+/**
+ * Device memory free callback.
+ */
+using DumpFreeCallback = int (*)(void *dev_ptr);
+
+/**
+ * Host -> Device copy callback.
+ */
+using DumpCopyToDeviceCallback = int (*)(void *dev_dst, const void *host_src, size_t size);
+
+/**
+ * Device -> Host copy callback.
+ */
+using DumpCopyFromDeviceCallback = int (*)(void *host_dst, const void *dev_src, size_t size);
+
+// =============================================================================
+// DumpedTensor - Collected tensor metadata + payload bytes
+// =============================================================================
+
+/**
+ * Collected tensor metadata + payload bytes (identical to A2A3 DumpedTensor).
+ */
+struct DumpedTensor {
+    uint64_t task_id;
+    uint8_t subtask_id;
+    uint32_t func_id;
+    uint32_t arg_index;
+    TensorDumpRole role;
+    TensorDumpStage stage;
+    uint8_t dtype;
+    uint8_t ndims;
+    uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS];
+    uint32_t shapes[PLATFORM_DUMP_MAX_DIMS];
+    uint32_t offsets[PLATFORM_DUMP_MAX_DIMS];
+    bool is_contiguous;
+    bool truncated;
+    bool overwritten;
+    uint64_t payload_size;
+    uint64_t bin_offset;
+    std::vector<uint8_t> bytes;
+};
+
+// =============================================================================
+// TensorDumpCollector - Main Collector
+// =============================================================================
+
+/**
+ * Host-side tensor dump collector.
+ *
+ * Lifecycle (mirrors PerformanceCollector):
+ *   1. initialize() — allocate DumpSetupHeader + per-thread DumpBuffers + arenas,
+ *      caller reads get_dump_setup_device_ptr() and sets kernel_args.dump_data_base
+ *   2. (AICPU execution writes records and payload data)
+ *   3. collect_all() — after stream sync, copy header + buffers + arenas back
+ *   4. export_dump_files() — write JSON manifest + binary payload
+ *   5. finalize() — free all device allocations
+ */
+class TensorDumpCollector {
+public:
+    TensorDumpCollector() = default;
+    ~TensorDumpCollector();
+
+    TensorDumpCollector(const TensorDumpCollector &) = delete;
+    TensorDumpCollector &operator=(const TensorDumpCollector &) = delete;
+
+    /**
+     * Initialize tensor dump device buffers.
+ * + * @param num_dump_threads Number of AICPU scheduling threads + * @param device_id Device ID + * @param alloc_cb Device memory alloc + * @param free_cb Device memory free + * @param copy_to_dev_cb Host->device copy + * @param copy_from_dev_cb Device->host copy + * @return 0 on success, error code on failure + */ + int initialize( + int num_dump_threads, int device_id, DumpAllocCallback alloc_cb, DumpFreeCallback free_cb, + DumpCopyToDeviceCallback copy_to_dev_cb, DumpCopyFromDeviceCallback copy_from_dev_cb + ); + + /** + * Copy all dump data back from device and parse into collected_ vector. + * Must be called after execution stream has been fully synchronized. + * + * @return 0 on success, error code on failure + */ + int collect_all(); + + /** + * Export collected data to dump files (JSON manifest + binary payload). + * + * @param output_path Output directory + * @return 0 on success, -1 on failure + */ + int export_dump_files(const std::string &output_path = "outputs"); + + /** + * Free all device buffers and clear host-side state. + * + * @return 0 on success, error code on failure + */ + int finalize(); + + /** + * Check if the collector has been initialized. + */ + bool is_initialized() const { return setup_header_dev_ != nullptr; } + + /** + * Get the device pointer to the DumpSetupHeader. + * Used to set kernel_args.dump_data_base. 
+ */ + void *get_dump_setup_device_ptr() const { return setup_header_dev_; } + +private: + // Device-side allocations + void *setup_header_dev_{nullptr}; + std::vector<void *> dump_buffers_dev_; // Per-thread DumpBuffer + std::vector<void *> arena_headers_dev_; // Per-thread DumpArenaHeader + std::vector<void *> arena_data_dev_; // Per-thread arena data + + // Configuration + int num_dump_threads_{0}; + int device_id_{-1}; + size_t dump_buffer_bytes_{0}; + + // Callbacks + DumpAllocCallback alloc_cb_{nullptr}; + DumpFreeCallback free_cb_{nullptr}; + DumpCopyToDeviceCallback copy_to_dev_cb_{nullptr}; + DumpCopyFromDeviceCallback copy_from_dev_cb_{nullptr}; + + // Collected data + std::vector<DumpedTensor> collected_; + + // Stats + uint32_t total_dropped_count_{0}; + uint32_t total_truncated_count_{0}; + uint32_t total_overwrite_count_{0}; +}; + +#endif // SRC_A5_PLATFORM_INCLUDE_HOST_TENSOR_DUMP_COLLECTOR_H_ diff --git a/src/a5/platform/onboard/aicpu/kernel.cpp b/src/a5/platform/onboard/aicpu/kernel.cpp index d82696201..02eff65ad 100644 --- a/src/a5/platform/onboard/aicpu/kernel.cpp +++ b/src/a5/platform/onboard/aicpu/kernel.cpp @@ -16,6 +16,7 @@ #include "aicpu/device_log.h" #include "aicpu/platform_regs.h" #include "aicpu/platform_aicpu_affinity.h" +#include "aicpu/tensor_dump_aicpu.h" #include "runtime.h" // Forward declaration of aicpu_execute (implemented in aicpu_executor.cpp) @@ -80,6 +81,8 @@ extern "C" __attribute__((visibility("default"))) int DynTileFwkBackendKernelSer // Store platform regs before calling aicpu_execute set_platform_regs(k_args->regs); + set_platform_dump_base(k_args->dump_data_base); + set_enable_dump_tensor(k_args->dump_data_base != 0); // Affinity gate: drop excess threads before entering runtime if (!platform_aicpu_affinity_gate(runtime->sche_cpu_num, PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH)) { diff --git a/src/a5/platform/onboard/host/CMakeLists.txt b/src/a5/platform/onboard/host/CMakeLists.txt index 12c86f4fd..2bef4d9ba 100644 --- 
a/src/a5/platform/onboard/host/CMakeLists.txt +++ b/src/a5/platform/onboard/host/CMakeLists.txt @@ -38,6 +38,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" ) if(DEFINED CUSTOM_SOURCE_DIRS) foreach(SRC_DIR ${CUSTOM_SOURCE_DIRS}) diff --git a/src/a5/platform/onboard/host/device_runner.cpp b/src/a5/platform/onboard/host/device_runner.cpp index e451a5efa..a3809a619 100644 --- a/src/a5/platform/onboard/host/device_runner.cpp +++ b/src/a5/platform/onboard/host/device_runner.cpp @@ -271,9 +271,8 @@ int DeviceRunner::copy_from_device(void *host_ptr, const void *dev_ptr, size_t b int DeviceRunner::run( Runtime &runtime, int block_dim, int device_id, const std::vector &aicpu_so_binary, - const std::vector &aicore_kernel_binary, int launch_aicpu_num + const std::vector &aicore_kernel_binary, int launch_aicpu_num, bool enable_dump_tensor ) { - // Validate launch_aicpu_num if (launch_aicpu_num < 1 || launch_aicpu_num > PLATFORM_MAX_AICPU_THREADS) { LOG_ERROR("launch_aicpu_num (%d) must be in range [1, %d]", launch_aicpu_num, PLATFORM_MAX_AICPU_THREADS); return -1; @@ -386,6 +385,15 @@ int DeviceRunner::run( } } + // Initialize tensor dump if enabled + if (enable_dump_tensor) { + rc = init_tensor_dump(runtime, num_aicore, device_id); + if (rc != 0) { + LOG_ERROR("init_tensor_dump failed: %d", rc); + return rc; + } + } + std::cout << "\n=== Initialize runtime args ===" << '\n'; // Initialize runtime args rc = kernel_args_.init_runtime_args(runtime, mem_alloc_); @@ -444,6 +452,12 @@ int DeviceRunner::run( export_swimlane_json(); } + // Collect and export tensor dump data + if (enable_dump_tensor) { + dump_collector_.collect_all(); + dump_collector_.export_dump_files(); + } + // Print handshake results (reads from device 
memory, must be before free) print_handshake_results(); @@ -508,6 +522,11 @@ int DeviceRunner::finalize() { perf_collector_.finalize(); } + // Cleanup tensor dump + if (dump_collector_.is_initialized()) { + dump_collector_.finalize(); + } + // Free all remaining allocations (including handshake buffer and binGmAddr) mem_alloc_.finalize(); @@ -687,3 +706,35 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i int DeviceRunner::export_swimlane_json(const std::string &output_path) { return perf_collector_.export_swimlane_json(output_path); } + +int DeviceRunner::init_tensor_dump(Runtime &runtime, int num_aicore, int device_id) { + (void)num_aicore; + int num_dump_threads = runtime.sche_cpu_num; + + auto alloc_cb = [](size_t size) -> void * { + void *ptr = nullptr; + int rc = rtMalloc(&ptr, size, RT_MEMORY_HBM, 0); + return (rc == 0) ? ptr : nullptr; + }; + + auto free_cb = [](void *dev_ptr) -> int { + return rtFree(dev_ptr); + }; + + auto copy_to_dev_cb = [](void *dev_dst, const void *host_src, size_t size) -> int { + return rtMemcpy(dev_dst, size, host_src, size, RT_MEMCPY_HOST_TO_DEVICE); + }; + + auto copy_from_dev_cb = [](void *host_dst, const void *dev_src, size_t size) -> int { + return rtMemcpy(host_dst, size, dev_src, size, RT_MEMCPY_DEVICE_TO_HOST); + }; + + int rc = + dump_collector_.initialize(num_dump_threads, device_id, alloc_cb, free_cb, copy_to_dev_cb, copy_from_dev_cb); + if (rc != 0) { + return rc; + } + + kernel_args_.args.dump_data_base = reinterpret_cast(dump_collector_.get_dump_setup_device_ptr()); + return 0; +} diff --git a/src/a5/platform/onboard/host/device_runner.h b/src/a5/platform/onboard/host/device_runner.h index 6658f7221..f14a46415 100644 --- a/src/a5/platform/onboard/host/device_runner.h +++ b/src/a5/platform/onboard/host/device_runner.h @@ -43,6 +43,7 @@ #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/performance_collector.h" +#include "host/tensor_dump_collector.h" 
#include "runtime.h" /** @@ -227,7 +228,7 @@ class DeviceRunner { */ int run(Runtime &runtime, int block_dim, int device_id, const std::vector &aicpu_so_binary, - const std::vector &aicore_kernel_binary, int launch_aicpu_num = 1); + const std::vector &aicore_kernel_binary, int launch_aicpu_num = 1, bool enable_dump_tensor = false); /** * Print handshake results from device @@ -359,6 +360,9 @@ class DeviceRunner { // Performance profiling PerformanceCollector perf_collector_; + // Tensor dump (independent from profiling) + TensorDumpCollector dump_collector_; + /** * Ensure device is initialized (lazy initialization) * @@ -404,6 +408,16 @@ class DeviceRunner { * @return 0 on success, error code on failure */ int init_performance_profiling(Runtime &runtime, int num_aicore, int device_id); + + /** + * Initialize tensor dump device buffers. + * + * @param runtime Runtime instance to configure + * @param num_aicore Number of AICore instances (unused) + * @param device_id Device ID for allocations + * @return 0 on success, error code on failure + */ + int init_tensor_dump(Runtime &runtime, int num_aicore, int device_id); }; #endif // RUNTIME_DEVICERUNNER_H diff --git a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp index cc65fe997..d672355f6 100644 --- a/src/a5/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/onboard/host/pto_runtime_c_api.cpp @@ -122,7 +122,7 @@ int set_device(DeviceContextHandle ctx, int device_id) { int run_runtime( DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_profiling + size_t aicore_size, int enable_profiling, int enable_dump_tensor ) { if (ctx == NULL || runtime == NULL) return -1; if (aicpu_binary == NULL || aicpu_size == 0 || aicore_binary == NULL || aicore_size == 0) 
return -1; @@ -168,7 +168,7 @@ int run_runtime( std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); std::vector aicore_vec(aicore_binary, aicore_binary + aicore_size); - rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num, enable_dump_tensor != 0); if (rc != 0) { validate_runtime_impl(r); r->~Runtime(); diff --git a/src/a5/platform/sim/host/CMakeLists.txt b/src/a5/platform/sim/host/CMakeLists.txt index 9ad460452..73b0029bd 100644 --- a/src/a5/platform/sim/host/CMakeLists.txt +++ b/src/a5/platform/sim/host/CMakeLists.txt @@ -43,6 +43,7 @@ list(APPEND HOST_RUNTIME_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/host_log.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/unified_log_host.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/performance_collector.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/../../src/host/tensor_dump_collector.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/../aicpu/platform_aicpu_affinity.cpp" ) diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 5bf17ffcc..7258642e6 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -145,6 +145,20 @@ int DeviceRunner::ensure_binaries_loaded( return -1; } + set_platform_dump_base_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_platform_dump_base")); + if (set_platform_dump_base_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_platform_dump_base: %s", dlerror()); + return -1; + } + + set_enable_dump_tensor_func_ = + reinterpret_cast(dlsym(aicpu_so_handle_, "set_enable_dump_tensor")); + if (set_enable_dump_tensor_func_ == nullptr) { + LOG_ERROR("dlsym failed for set_enable_dump_tensor: %s", dlerror()); + return -1; + } + LOG_INFO("DeviceRunner(sim): Loaded aicpu_execute from %s", aicpu_so_path_.c_str()); } @@ -209,9 +223,8 @@ int DeviceRunner::copy_from_device(void *host_ptr, 
const void *dev_ptr, size_t b int DeviceRunner::run( Runtime &runtime, int block_dim, int device_id, const std::vector &aicpu_so_binary, - const std::vector &aicore_kernel_binary, int launch_aicpu_num + const std::vector &aicore_kernel_binary, int launch_aicpu_num, bool enable_dump_tensor ) { - clear_cpu_sim_shared_storage(); // Validate launch_aicpu_num if (launch_aicpu_num < 1 || launch_aicpu_num > PLATFORM_MAX_AICPU_THREADS) { LOG_ERROR("launch_aicpu_num (%d) must be in range [1, %d]", launch_aicpu_num, PLATFORM_MAX_AICPU_THREADS); @@ -307,6 +320,15 @@ int DeviceRunner::run( } } + // Initialize tensor dump if enabled + if (enable_dump_tensor) { + rc = init_tensor_dump(runtime, num_aicore, device_id); + if (rc != 0) { + LOG_ERROR("init_tensor_dump failed: %d", rc); + return rc; + } + } + // Allocate simulated register blocks for all AICore cores // Using sparse mapping: 2 x 4KB pages per core instead of 24KB contiguous block size_t total_reg_size = num_aicore * SIM_REG_TOTAL_SIZE; @@ -353,6 +375,8 @@ int DeviceRunner::run( // Set platform regs in the AICPU .so before launching threads set_platform_regs_func_(kernel_args_.regs); + set_platform_dump_base_func_(kernel_args_.dump_data_base); + set_enable_dump_tensor_func_(enable_dump_tensor); // Launch AICPU threads (over-launch for affinity gate) constexpr int over_launch = PLATFORM_MAX_AICPU_THREADS_JUST_FOR_LAUNCH; @@ -396,6 +420,12 @@ int DeviceRunner::run( export_swimlane_json(); } + // Collect and export tensor dump data + if (enable_dump_tensor) { + dump_collector_.collect_all(); + dump_collector_.export_dump_files(); + } + // Print handshake results at end of run print_handshake_results(); @@ -428,6 +458,8 @@ void DeviceRunner::unload_executor_binaries() { aicpu_so_handle_ = nullptr; aicpu_execute_func_ = nullptr; set_platform_regs_func_ = nullptr; + set_platform_dump_base_func_ = nullptr; + set_enable_dump_tensor_func_ = nullptr; } if (!aicpu_so_path_.empty()) { std::remove(aicpu_so_path_.c_str()); @@ -456,6 
+488,11 @@ int DeviceRunner::finalize() { perf_collector_.finalize(); } + // Cleanup tensor dump + if (dump_collector_.is_initialized()) { + dump_collector_.finalize(); + } + // Kernel binaries should have been removed by validate_runtime_impl() if (!func_id_to_addr_.empty()) { LOG_ERROR("finalize() called with %zu kernel binaries still cached", func_id_to_addr_.size()); @@ -619,3 +656,36 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i int DeviceRunner::export_swimlane_json(const std::string &output_path) { return perf_collector_.export_swimlane_json(output_path); } + +int DeviceRunner::init_tensor_dump(Runtime &runtime, int num_aicore, int device_id) { + (void)num_aicore; + int num_dump_threads = runtime.sche_cpu_num; + + auto alloc_cb = [](size_t size) -> void * { + return malloc(size); + }; + + auto free_cb = [](void *dev_ptr) -> int { + free(dev_ptr); + return 0; + }; + + auto copy_to_dev_cb = [](void *dev_dst, const void *host_src, size_t size) -> int { + std::memcpy(dev_dst, host_src, size); + return 0; + }; + + auto copy_from_dev_cb = [](void *host_dst, const void *dev_src, size_t size) -> int { + std::memcpy(host_dst, dev_src, size); + return 0; + }; + + int rc = + dump_collector_.initialize(num_dump_threads, device_id, alloc_cb, free_cb, copy_to_dev_cb, copy_from_dev_cb); + if (rc != 0) { + return rc; + } + + kernel_args_.dump_data_base = reinterpret_cast(dump_collector_.get_dump_setup_device_ptr()); + return 0; +} diff --git a/src/a5/platform/sim/host/device_runner.h b/src/a5/platform/sim/host/device_runner.h index 1728992fe..9653a425d 100644 --- a/src/a5/platform/sim/host/device_runner.h +++ b/src/a5/platform/sim/host/device_runner.h @@ -48,6 +48,7 @@ #include "host/function_cache.h" #include "host/memory_allocator.h" #include "host/performance_collector.h" +#include "host/tensor_dump_collector.h" #include "runtime.h" /** @@ -140,7 +141,7 @@ class DeviceRunner { */ int run(Runtime &runtime, int block_dim, int 
device_id, const std::vector &aicpu_so_binary, - const std::vector &aicore_kernel_binary, int launch_aicpu_num = 1); + const std::vector &aicore_kernel_binary, int launch_aicpu_num = 1, bool enable_dump_tensor = false); /** * Print handshake results @@ -220,12 +221,17 @@ class DeviceRunner { int (*aicpu_execute_func_)(Runtime *){nullptr}; void (*aicore_execute_func_)(Runtime *, int, CoreType, uint32_t, uint64_t){nullptr}; void (*set_platform_regs_func_)(uint64_t){nullptr}; + void (*set_platform_dump_base_func_)(uint64_t){nullptr}; + void (*set_enable_dump_tensor_func_)(bool){nullptr}; std::string aicpu_so_path_; std::string aicore_so_path_; // Performance profiling PerformanceCollector perf_collector_; + // Tensor dump (independent from profiling) + TensorDumpCollector dump_collector_; + // Private helper methods int ensure_device_initialized( int device_id, const std::vector &aicpu_so_binary, const std::vector &aicore_kernel_binary @@ -246,6 +252,11 @@ class DeviceRunner { * @return 0 on success, error code on failure */ int init_performance_profiling(Runtime &runtime, int num_aicore, int device_id); + + /** + * Initialize tensor dump for simulation. 
+ */ + int init_tensor_dump(Runtime &runtime, int num_aicore, int device_id); }; #endif // SRC_A5_PLATFORM_SIM_HOST_DEVICE_RUNNER_H_ diff --git a/src/a5/platform/sim/host/pto_runtime_c_api.cpp b/src/a5/platform/sim/host/pto_runtime_c_api.cpp index 28e382724..3e7dfd89e 100644 --- a/src/a5/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a5/platform/sim/host/pto_runtime_c_api.cpp @@ -124,7 +124,7 @@ int set_device(DeviceContextHandle ctx, int device_id) { int run_runtime( DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_profiling + size_t aicore_size, int enable_profiling, int enable_dump_tensor ) { if (ctx == NULL || runtime == NULL) return -1; @@ -167,7 +167,7 @@ int run_runtime( if (aicore_binary != NULL && aicore_size > 0) { aicore_vec.assign(aicore_binary, aicore_binary + aicore_size); } - rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num); + rc = runner->run(*r, block_dim, device_id, aicpu_vec, aicore_vec, aicpu_thread_num, enable_dump_tensor != 0); if (rc != 0) { validate_runtime_impl(r); r->~Runtime(); diff --git a/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp new file mode 100644 index 000000000..cec7ad9c4 --- /dev/null +++ b/src/a5/platform/src/aicpu/tensor_dump_aicpu.cpp @@ -0,0 +1,374 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file tensor_dump_aicpu.cpp + * @brief AICPU tensor dump collection implementation (memcpy-based) + * + * Simplified version of A2A3's tensor_dump_aicpu.cpp: + * - No SPSC free queues or ready queues + * - Per-thread DumpBuffer with count-first layout (like PerfBuffer) + * - Per-thread circular arena for tensor payload data + * - Silently drops records when DumpBuffer is full + * - Host copies everything back after stream sync + */ + +#include "aicpu/tensor_dump_aicpu.h" + +#include <cstring> + +#include "common/memory_barrier.h" +#include "common/platform_config.h" +#include "common/unified_log.h" + +// ============================================================================= +// Static State +// ============================================================================= + +static uint64_t g_platform_dump_base = 0; +static bool g_enable_dump_tensor = false; + +static DumpSetupHeader *s_setup_header = nullptr; +static DumpBuffer *s_dump_buffers[PLATFORM_MAX_AICPU_THREADS] = {}; +static DumpArenaHeader *s_arena_headers[PLATFORM_MAX_AICPU_THREADS] = {}; +static char *s_arena_data[PLATFORM_MAX_AICPU_THREADS] = {}; + +static bool s_logged_dump_layout_mismatch = false; +static uint32_t s_records_written[PLATFORM_MAX_AICPU_THREADS] = {}; + +// ============================================================================= +// Extern "C" API +// ============================================================================= + +extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dump_base = dump_data_base; } + +extern "C" uint64_t 
get_platform_dump_base() { return g_platform_dump_base; } + +extern "C" void set_enable_dump_tensor(bool enable) { g_enable_dump_tensor = enable; } + +extern "C" bool get_enable_dump_tensor() { return g_enable_dump_tensor; } + +// ============================================================================= +// Helper Functions (same as A2A3) +// ============================================================================= + +bool get_tensor_dump_role_from_direction(ArgDirection dir, TensorDumpRole *role) { + switch (dir) { + case ArgDirection::IN: + *role = TensorDumpRole::INPUT; + return true; + case ArgDirection::OUT: + *role = TensorDumpRole::OUTPUT; + return true; + case ArgDirection::INOUT: + *role = TensorDumpRole::INOUT; + return true; + case ArgDirection::SCALAR: + return false; + } + return false; +} + +int32_t count_callable_tensor_args(const CoreCallable &callable) { + int32_t tensor_count = 0; + for (int32_t i = 0; i < callable.sig_count(); i++) { + if (callable.sig(i) != ArgDirection::SCALAR) { + tensor_count++; + } + } + return tensor_count; +} + +bool should_dump_tensor_at_stage(TensorDumpRole role, TensorDumpStage stage) { + switch (role) { + case TensorDumpRole::INPUT: + return stage == TensorDumpStage::BEFORE_DISPATCH; + case TensorDumpRole::OUTPUT: + return stage == TensorDumpStage::AFTER_COMPLETION; + case TensorDumpRole::INOUT: + return true; + } + return false; +} + +bool try_log_tensor_dump_layout_mismatch() { + if (s_logged_dump_layout_mismatch) { + return false; + } + s_logged_dump_layout_mismatch = true; + return true; +} + +// ============================================================================= +// Circular Arena Writer (same as A2A3) +// ============================================================================= + +struct CircularArenaWriter { + char *arena; + uint64_t arena_size; + uint64_t base_offset; + uint64_t bytes_written; + + void write(const void *src, uint64_t size) { + if (size == 0) { + return; + } + uint64_t pos 
= (base_offset + bytes_written) % arena_size; + if (pos + size <= arena_size) { + memcpy(arena + pos, src, size); + } else { + uint64_t first = arena_size - pos; + memcpy(arena + pos, src, first); + memcpy(arena, reinterpret_cast(src) + first, size - first); + } + bytes_written += size; + } +}; + +static inline uint64_t get_tensor_dump_num_elements(const TensorDumpInfo &info) { + uint64_t elements = 1; + for (uint32_t d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + elements *= info.shapes[d]; + } + return elements; +} + +static inline bool tensor_dump_is_contiguous(const TensorDumpInfo &info) { + if (info.ndims == 0) { + return true; + } + for (uint32_t d = 1; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + if (info.shapes[d] != info.raw_shapes[d]) { + return false; + } + } + return true; +} + +static inline uint64_t tensor_dump_start_offset_elements(const TensorDumpInfo &info) { + uint64_t result = 0; + uint64_t stride = 1; + for (int d = static_cast(info.ndims) - 1; d >= 0; d--) { + result += static_cast(info.offsets[d]) * stride; + stride *= info.raw_shapes[d]; + } + return result; +} + +static inline void write_tensor_dump_contiguous_prefix( + CircularArenaWriter *writer, const TensorDumpInfo &info, uint64_t elem_sz, uint64_t copy_bytes +) { + uint64_t start_offset = tensor_dump_start_offset_elements(info); + const char *src = reinterpret_cast(info.buffer_addr) + start_offset * elem_sz; + writer->write(src, copy_bytes); +} + +static void gather_tensor_dump_dim( + CircularArenaWriter *writer, const TensorDumpInfo &info, uint64_t elem_sz, uint32_t dim, + uint64_t base_element_index, uint64_t *remaining_bytes +) { + if (*remaining_bytes == 0 || dim >= PLATFORM_DUMP_MAX_DIMS) { + return; + } + if (dim + 1 >= info.ndims) { + uint64_t row_start = base_element_index + info.offsets[dim]; + const char *src = reinterpret_cast(info.buffer_addr) + row_start * elem_sz; + uint64_t row_bytes = static_cast(info.shapes[dim]) * elem_sz; + uint64_t 
bytes_to_copy = (row_bytes < *remaining_bytes) ? row_bytes : *remaining_bytes; + writer->write(src, bytes_to_copy); + *remaining_bytes -= bytes_to_copy; + return; + } + + uint64_t inner_stride = 1; + for (uint32_t d = dim + 1; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + inner_stride *= info.raw_shapes[d]; + } + for (uint32_t i = 0; i < info.shapes[dim] && *remaining_bytes > 0; i++) { + uint64_t next_base = base_element_index + (static_cast(info.offsets[dim]) + i) * inner_stride; + gather_tensor_dump_dim(writer, info, elem_sz, dim + 1, next_base, remaining_bytes); + } +} + +static inline void write_tensor_dump_logical_prefix( + CircularArenaWriter *writer, const TensorDumpInfo &info, uint64_t elem_sz, uint64_t copy_bytes +) { + if (copy_bytes == 0) { + return; + } + if (tensor_dump_is_contiguous(info)) { + write_tensor_dump_contiguous_prefix(writer, info, elem_sz, copy_bytes); + return; + } + + uint64_t remaining_bytes = copy_bytes; + gather_tensor_dump_dim(writer, info, elem_sz, 0, 0, &remaining_bytes); +} + +// ============================================================================= +// Public API Implementation +// ============================================================================= + +void dump_tensor_init(int num_dump_threads) { + void *dump_base = reinterpret_cast(get_platform_dump_base()); + if (dump_base == nullptr) { + LOG_ERROR("platform dump base is NULL, cannot initialize tensor dump"); + return; + } + + s_setup_header = get_dump_setup_header(dump_base); + + LOG_INFO("Initializing tensor dump for %d threads (memcpy-based)", num_dump_threads); + + for (int t = 0; t < num_dump_threads && t < PLATFORM_MAX_AICPU_THREADS; t++) { + uint64_t buf_ptr = s_setup_header->dump_buffer_ptrs[t]; + uint64_t arena_hdr_ptr = s_setup_header->arena_header_ptrs[t]; + uint64_t arena_data_ptr = s_setup_header->arena_data_ptrs[t]; + + if (buf_ptr == 0) { + LOG_ERROR("Thread %d: dump_buffer_ptrs[%d] is NULL during init!", t, t); + s_dump_buffers[t] = 
nullptr; + continue; + } + + DumpBuffer *buf = reinterpret_cast<DumpBuffer *>(buf_ptr); + buf->count = 0; + buf->dropped_count = 0; + s_dump_buffers[t] = buf; + + s_arena_headers[t] = reinterpret_cast<DumpArenaHeader *>(arena_hdr_ptr); + s_arena_data[t] = reinterpret_cast<char *>(arena_data_ptr); + + if (s_arena_headers[t] != nullptr) { + s_arena_headers[t]->write_offset = 0; + } + + LOG_DEBUG( + "Thread %d: DumpBuffer at 0x%lx, arena at 0x%lx (size=%lu)", t, buf_ptr, arena_data_ptr, + s_setup_header->arena_sizes[t] + ); + } + + memset(s_records_written, 0, sizeof(s_records_written)); + s_logged_dump_layout_mismatch = false; + + wmb(); + LOG_INFO("Tensor dump initialized for %d threads", num_dump_threads); +} + +int dump_tensor_record(int thread_idx, const TensorDumpInfo &info) { + if (s_setup_header == nullptr) { + return -1; + } + + DumpBuffer *buf = s_dump_buffers[thread_idx]; + if (buf == nullptr) { + return -1; + } + + TensorDumpRecord *records = get_dump_buffer_records(buf); + + // Check capacity — drop if full + uint32_t count = buf->count; + if (count >= buf->capacity) { + uint32_t prev = buf->dropped_count; + uint32_t next = prev + 1; + buf->dropped_count = (next < prev) ? 
UINT32_MAX : next; + return 0; + } + + // Compute tensor data size + uint64_t actual_elements = get_tensor_dump_num_elements(info); + uint64_t elem_sz = get_element_size(static_cast(info.dtype)); + uint64_t bytes = actual_elements * elem_sz; + uint64_t copy_bytes = bytes; + bool truncated = false; + bool is_contiguous = tensor_dump_is_contiguous(info); + + DumpArenaHeader *arena_hdr = s_arena_headers[thread_idx]; + char *arena = s_arena_data[thread_idx]; + + if (arena_hdr != nullptr && arena != nullptr) { + uint64_t arena_sz = arena_hdr->arena_size; + if (bytes > arena_sz) { + copy_bytes = arena_sz / 2; + truncated = true; + } + + uint64_t offset = arena_hdr->write_offset; + arena_hdr->write_offset = offset + copy_bytes; + + CircularArenaWriter writer = {arena, arena_sz, offset, 0}; + write_tensor_dump_logical_prefix(&writer, info, elem_sz, copy_bytes); + wmb(); + + // Fill metadata record + TensorDumpRecord *rec = &records[count]; + rec->task_id = info.task_id; + rec->subtask_id = info.subtask_id; + rec->func_id = info.func_id; + rec->arg_index = info.arg_index; + rec->is_contiguous = is_contiguous ? 1 : 0; + rec->role = static_cast(info.role); + rec->stage = static_cast(info.stage); + rec->ndims = info.ndims; + rec->dtype = info.dtype; + rec->truncated = truncated ? 1 : 0; + rec->payload_offset = offset; + rec->payload_size = copy_bytes; + for (int d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + rec->raw_shapes[d] = info.raw_shapes[d]; + rec->shapes[d] = info.shapes[d]; + rec->offsets[d] = info.offsets[d]; + } + } else { + // No arena — record metadata only, no payload + TensorDumpRecord *rec = &records[count]; + rec->task_id = info.task_id; + rec->subtask_id = info.subtask_id; + rec->func_id = info.func_id; + rec->arg_index = info.arg_index; + rec->is_contiguous = is_contiguous ? 
1 : 0; + rec->role = static_cast(info.role); + rec->stage = static_cast(info.stage); + rec->ndims = info.ndims; + rec->dtype = info.dtype; + rec->truncated = 1; + rec->payload_offset = 0; + rec->payload_size = 0; + for (int d = 0; d < info.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + rec->raw_shapes[d] = info.raw_shapes[d]; + rec->shapes[d] = info.shapes[d]; + rec->offsets[d] = info.offsets[d]; + } + } + + buf->count = count + 1; + wmb(); + + if (thread_idx >= 0 && thread_idx < PLATFORM_MAX_AICPU_THREADS) { + s_records_written[thread_idx]++; + } + + return 0; +} + +void dump_tensor_flush(int thread_idx) { + // In the memcpy design, flush is a no-op for data — host reads after sync. + // Log per-thread statistics for diagnostics. + if (thread_idx >= 0 && thread_idx < PLATFORM_MAX_AICPU_THREADS) { + DumpBuffer *buf = s_dump_buffers[thread_idx]; + uint32_t dropped = (buf != nullptr) ? buf->dropped_count : 0; + LOG_INFO( + "Thread %d: dump_tensor_flush (records=%u, dropped=%u)", thread_idx, s_records_written[thread_idx], dropped + ); + } +} diff --git a/src/a5/platform/src/host/tensor_dump_collector.cpp b/src/a5/platform/src/host/tensor_dump_collector.cpp new file mode 100644 index 000000000..b6131e13e --- /dev/null +++ b/src/a5/platform/src/host/tensor_dump_collector.cpp @@ -0,0 +1,517 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * @file tensor_dump_collector.cpp + * @brief Host-side tensor dump collector implementation (memcpy-based) + * + * Mirrors performance_collector.cpp patterns: + * - Allocate device buffers, copy header to device + * - After stream sync, two-step copy (header then data) + * - Export to JSON manifest + binary payload + */ + +#include "host/tensor_dump_collector.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "common/memory_barrier.h" +#include "common/unified_log.h" + +// ============================================================================= +// TensorDumpCollector +// ============================================================================= + +TensorDumpCollector::~TensorDumpCollector() { + if (is_initialized()) { + LOG_ERROR("TensorDumpCollector destroyed without finalize()"); + } +} + +int TensorDumpCollector::initialize( + int num_dump_threads, int device_id, DumpAllocCallback alloc_cb, DumpFreeCallback free_cb, + DumpCopyToDeviceCallback copy_to_dev_cb, DumpCopyFromDeviceCallback copy_from_dev_cb +) { + if (is_initialized()) { + LOG_ERROR("TensorDumpCollector already initialized"); + return -1; + } + + num_dump_threads_ = num_dump_threads; + device_id_ = device_id; + alloc_cb_ = alloc_cb; + free_cb_ = free_cb; + copy_to_dev_cb_ = copy_to_dev_cb; + copy_from_dev_cb_ = copy_from_dev_cb; + + int capacity = PLATFORM_DUMP_RECORDS_PER_BUFFER; + dump_buffer_bytes_ = calc_dump_buffer_size(capacity); + uint64_t arena_size = calc_dump_arena_size(); + + LOG_INFO( + "Initializing tensor dump: %d threads, %d records/buffer, %zu bytes/buffer, %lu bytes/arena", num_dump_threads, + capacity, dump_buffer_bytes_, arena_size + ); + + // Allocate DumpSetupHeader on device + setup_header_dev_ = alloc_cb_(sizeof(DumpSetupHeader)); + if (setup_header_dev_ == nullptr) { + LOG_ERROR("Failed to allocate 
DumpSetupHeader (%zu bytes)", sizeof(DumpSetupHeader)); + return -1; + } + + // Build host-side setup header + DumpSetupHeader host_header = {}; + host_header.num_dump_threads = static_cast(num_dump_threads); + host_header.records_per_buffer = static_cast(capacity); + host_header.magic = TENSOR_DUMP_MAGIC; + + // Allocate per-thread buffers and arenas + dump_buffers_dev_.resize(num_dump_threads, nullptr); + arena_headers_dev_.resize(num_dump_threads, nullptr); + arena_data_dev_.resize(num_dump_threads, nullptr); + + for (int t = 0; t < num_dump_threads; t++) { + // Allocate DumpBuffer + void *buf = alloc_cb_(dump_buffer_bytes_); + if (buf == nullptr) { + LOG_ERROR("Failed to allocate DumpBuffer for thread %d (%zu bytes)", t, dump_buffer_bytes_); + finalize(); + return -1; + } + dump_buffers_dev_[t] = buf; + + // Initialize DumpBuffer on host then copy to device + std::vector buf_init(dump_buffer_bytes_, 0); + DumpBuffer *buf_host = reinterpret_cast(buf_init.data()); + buf_host->count = 0; + buf_host->capacity = static_cast(capacity); + buf_host->dropped_count = 0; + int rc = copy_to_dev_cb_(buf, buf_init.data(), dump_buffer_bytes_); + if (rc != 0) { + LOG_ERROR("Failed to initialize DumpBuffer for thread %d: %d", t, rc); + finalize(); + return rc; + } + + // Allocate DumpArenaHeader + void *arena_hdr = alloc_cb_(sizeof(DumpArenaHeader)); + if (arena_hdr == nullptr) { + LOG_ERROR("Failed to allocate DumpArenaHeader for thread %d", t); + finalize(); + return -1; + } + arena_headers_dev_[t] = arena_hdr; + + // Initialize arena header + DumpArenaHeader host_arena_hdr = {}; + host_arena_hdr.write_offset = 0; + host_arena_hdr.arena_size = arena_size; + rc = copy_to_dev_cb_(arena_hdr, &host_arena_hdr, sizeof(DumpArenaHeader)); + if (rc != 0) { + LOG_ERROR("Failed to initialize DumpArenaHeader for thread %d: %d", t, rc); + finalize(); + return rc; + } + + // Allocate arena data + void *arena = alloc_cb_(static_cast(arena_size)); + if (arena == nullptr) { + 
LOG_ERROR("Failed to allocate arena for thread %d (%lu bytes)", t, arena_size); + finalize(); + return -1; + } + arena_data_dev_[t] = arena; + + // Fill setup header pointers + host_header.dump_buffer_ptrs[t] = reinterpret_cast(buf); + host_header.arena_header_ptrs[t] = reinterpret_cast(arena_hdr); + host_header.arena_data_ptrs[t] = reinterpret_cast(arena); + host_header.arena_sizes[t] = arena_size; + } + + // Copy setup header to device + int rc = copy_to_dev_cb_(setup_header_dev_, &host_header, sizeof(DumpSetupHeader)); + if (rc != 0) { + LOG_ERROR("Failed to copy DumpSetupHeader to device: %d", rc); + finalize(); + return rc; + } + + LOG_INFO("Tensor dump initialized: %d threads, header at %p", num_dump_threads, setup_header_dev_); + return 0; +} + +int TensorDumpCollector::collect_all() { + if (!is_initialized()) { + return -1; + } + + LOG_INFO("Collecting tensor dump data from %d threads...", num_dump_threads_); + + uint64_t arena_size = calc_dump_arena_size(); + + for (int t = 0; t < num_dump_threads_; t++) { + // Step 1: Copy back DumpBuffer header (64 bytes) to read count + DumpBuffer host_buf_header = {}; + int rc = copy_from_dev_cb_(&host_buf_header, dump_buffers_dev_[t], sizeof(DumpBuffer)); + if (rc != 0) { + LOG_ERROR("Thread %d: failed to copy DumpBuffer header: %d", t, rc); + continue; + } + + uint32_t count = host_buf_header.count; + uint32_t dropped = host_buf_header.dropped_count; + total_dropped_count_ += dropped; + + if (count == 0) { + LOG_DEBUG("Thread %d: no dump records", t); + continue; + } + + // Step 2: Copy back the actual records (count * sizeof(TensorDumpRecord)) + size_t records_bytes = static_cast(count) * sizeof(TensorDumpRecord); + std::vector records_buf(records_bytes); + void *dev_records = reinterpret_cast(dump_buffers_dev_[t]) + sizeof(DumpBuffer); + rc = copy_from_dev_cb_(records_buf.data(), dev_records, records_bytes); + if (rc != 0) { + LOG_ERROR("Thread %d: failed to copy %u dump records: %d", t, count, rc); + continue; + } 
+ + // Step 3: Copy back arena header + DumpArenaHeader host_arena_hdr = {}; + rc = copy_from_dev_cb_(&host_arena_hdr, arena_headers_dev_[t], sizeof(DumpArenaHeader)); + if (rc != 0) { + LOG_ERROR("Thread %d: failed to copy arena header: %d", t, rc); + continue; + } + + // Step 4: Copy back arena data (only up to min(write_offset, arena_size)) + uint64_t arena_bytes_to_copy = host_arena_hdr.write_offset; + if (arena_bytes_to_copy > arena_size) { + arena_bytes_to_copy = arena_size; // Circular wraparound — copy entire arena + } + std::vector arena_buf(static_cast(arena_bytes_to_copy)); + if (arena_bytes_to_copy > 0) { + rc = copy_from_dev_cb_(arena_buf.data(), arena_data_dev_[t], static_cast(arena_bytes_to_copy)); + if (rc != 0) { + LOG_ERROR("Thread %d: failed to copy arena data (%lu bytes): %d", t, arena_bytes_to_copy, rc); + continue; + } + } + + // Step 5: Reconstruct DumpedTensor entries + const TensorDumpRecord *records = reinterpret_cast(records_buf.data()); + for (uint32_t i = 0; i < count; i++) { + const TensorDumpRecord &rec = records[i]; + + DumpedTensor dt = {}; + dt.task_id = rec.task_id; + dt.subtask_id = rec.subtask_id; + dt.func_id = rec.func_id; + dt.arg_index = rec.arg_index; + dt.role = static_cast(rec.role); + dt.stage = static_cast(rec.stage); + dt.dtype = rec.dtype; + dt.ndims = rec.ndims; + dt.is_contiguous = (rec.is_contiguous != 0); + dt.truncated = (rec.truncated != 0); + dt.payload_size = rec.payload_size; + + for (int d = 0; d < rec.ndims && d < PLATFORM_DUMP_MAX_DIMS; d++) { + dt.shapes[d] = rec.shapes[d]; + dt.offsets[d] = rec.offsets[d]; + dt.raw_shapes[d] = rec.raw_shapes[d]; + } + + if (dt.truncated) { + total_truncated_count_++; + } + + // Check for arena overwrite + bool overwritten = false; + if (host_arena_hdr.write_offset > arena_size) { + uint64_t oldest_valid = host_arena_hdr.write_offset - arena_size; + if (rec.payload_offset < oldest_valid) { + overwritten = true; + total_overwrite_count_++; + } + } + dt.overwritten = 
overwritten; + + // Extract payload from arena + if (rec.payload_size > 0 && !overwritten && arena_bytes_to_copy > 0) { + uint64_t arena_sz = arena_size; + uint64_t pos = rec.payload_offset % arena_sz; + uint64_t sz = rec.payload_size; + dt.bytes.resize(static_cast(sz)); + + if (pos + sz <= arena_sz) { + memcpy(dt.bytes.data(), arena_buf.data() + pos, static_cast(sz)); + } else { + // Circular wraparound read + uint64_t first = arena_sz - pos; + memcpy(dt.bytes.data(), arena_buf.data() + pos, static_cast(first)); + memcpy(dt.bytes.data() + first, arena_buf.data(), static_cast(sz - first)); + } + } + + collected_.push_back(std::move(dt)); + } + + LOG_INFO("Thread %d: collected %u records (dropped=%u)", t, count, dropped); + } + + LOG_INFO("Tensor dump collection complete: %zu tensors total", collected_.size()); + return 0; +} + +// ============================================================================= +// Export Helpers +// ============================================================================= + +static const char *tensor_dump_role_name(TensorDumpRole role) { + switch (role) { + case TensorDumpRole::INPUT: + return "input"; + case TensorDumpRole::OUTPUT: + return "output"; + case TensorDumpRole::INOUT: + return "inout"; + } + return "unknown"; +} + +static const char *tensor_dump_stage_name(TensorDumpStage stage) { + switch (stage) { + case TensorDumpStage::BEFORE_DISPATCH: + return "before_dispatch"; + case TensorDumpStage::AFTER_COMPLETION: + return "after_completion"; + } + return "unknown"; +} + +static std::string dims_to_string(const uint32_t dims[], int ndims) { + std::ostringstream ss; + ss << "["; + for (int d = 0; d < ndims; d++) { + if (d > 0) { + ss << ", "; + } + ss << dims[d]; + } + ss << "]"; + return ss.str(); +} + +static std::string get_dtype_name_from_raw(uint8_t dtype) { return get_dtype_name(static_cast(dtype)); } + +static uint64_t get_num_elements(const DumpedTensor &dt) { + uint64_t numel = 1; + for (int d = 0; d < dt.ndims; 
d++) { + numel *= dt.shapes[d]; + } + return (dt.ndims == 0) ? 1 : numel; +} + +int TensorDumpCollector::export_dump_files(const std::string &output_path) { + if (collected_.empty()) { + LOG_WARN("No tensor dump data to export"); + return 0; + } + auto export_start = std::chrono::steady_clock::now(); + + // Sort by task_id then subtask_id then func_id + std::sort(collected_.begin(), collected_.end(), [](const DumpedTensor &a, const DumpedTensor &b) { + if (a.task_id != b.task_id) return a.task_id < b.task_id; + if (a.subtask_id != b.subtask_id) return a.subtask_id < b.subtask_id; + if (a.func_id != b.func_id) return a.func_id < b.func_id; + if (a.stage != b.stage) return static_cast(a.stage) < static_cast(b.stage); + if (a.arg_index != b.arg_index) return a.arg_index < b.arg_index; + return static_cast(a.role) < static_cast(b.role); + }); + + // Create timestamped output directory + auto now = std::chrono::system_clock::now(); + auto time_t = std::chrono::system_clock::to_time_t(now); + std::tm tm = {}; + localtime_r(&time_t, &tm); + std::ostringstream ts; + ts << std::put_time(&tm, "%Y%m%d_%H%M%S"); + std::string timestamp = ts.str(); + + std::filesystem::path run_dir = std::filesystem::path(output_path) / ("tensor_dump_" + timestamp); + std::filesystem::create_directories(run_dir); + + std::string base_name = run_dir.filename().string(); + + // Write binary payload file + std::ofstream bin_file(run_dir / (base_name + ".bin"), std::ios::binary); + uint64_t bin_offset = 0; + for (auto &dt : collected_) { + dt.bin_offset = bin_offset; + if (!dt.bytes.empty()) { + bin_file.write( + reinterpret_cast(dt.bytes.data()), static_cast(dt.bytes.size()) + ); + bin_offset += dt.bytes.size(); + } + dt.bytes.clear(); // Free memory after writing + dt.bytes.shrink_to_fit(); + } + bin_file.close(); + + // Count stats + uint32_t num_before_dispatch = 0; + uint32_t num_after_completion = 0; + uint32_t num_input_tensors = 0; + uint32_t num_output_tensors = 0; + uint32_t 
num_inout_tensors = 0; + for (const auto &dt : collected_) { + if (dt.stage == TensorDumpStage::BEFORE_DISPATCH) { + num_before_dispatch++; + } else { + num_after_completion++; + } + switch (dt.role) { + case TensorDumpRole::INPUT: + num_input_tensors++; + break; + case TensorDumpRole::OUTPUT: + num_output_tensors++; + break; + case TensorDumpRole::INOUT: + num_inout_tensors++; + break; + } + } + + // Write JSON manifest + LOG_INFO("Writing JSON manifest for %zu tensors...", collected_.size()); + std::ofstream json(run_dir / (base_name + ".json")); + json << "{\n"; + json << " \"timestamp\": \"" << timestamp << "\",\n"; + json << " \"run_dir\": \"" << base_name << "\",\n"; + json << " \"bin_format\": {\n"; + json << " \"type\": \"logical_contiguous\",\n"; + json << " \"byte_order\": \"little_endian\"\n"; + json << " },\n"; + json << " \"total_tensors\": " << collected_.size() << ",\n"; + json << " \"before_dispatch\": " << num_before_dispatch << ",\n"; + json << " \"after_completion\": " << num_after_completion << ",\n"; + json << " \"input_tensors\": " << num_input_tensors << ",\n"; + json << " \"output_tensors\": " << num_output_tensors << ",\n"; + json << " \"inout_tensors\": " << num_inout_tensors << ",\n"; + json << " \"truncated_tensors\": " << total_truncated_count_ << ",\n"; + json << " \"dropped_records\": " << total_dropped_count_ << ",\n"; + json << " \"dropped_overwrite\": " << total_overwrite_count_ << ",\n"; + json << " \"bin_file\": \"" << base_name << ".bin\",\n"; + json << " \"tensors\": [\n"; + + bool first_entry = true; + for (size_t i = 0; i < collected_.size(); i++) { + const DumpedTensor &dt = collected_[i]; + std::string dtype_name = get_dtype_name_from_raw(dt.dtype); + uint64_t numel = get_num_elements(dt); + + std::string shape_str = dims_to_string(dt.shapes, dt.ndims); + std::string raw_shape_str = dims_to_string(dt.raw_shapes, dt.ndims); + std::string offsets_str = dims_to_string(dt.offsets, dt.ndims); + + if (!first_entry) json << ",\n"; 
+ first_entry = false; + + json << " {\"task_id\": \"0x" << std::hex << std::setfill('0') << std::setw(16) << dt.task_id << std::dec + << "\", \"subtask_id\": " << static_cast(dt.subtask_id) << ", \"func_id\": " << dt.func_id + << ", \"role\": \"" << tensor_dump_role_name(dt.role) << "\", \"stage\": \"" + << tensor_dump_stage_name(dt.stage) << "\", \"arg_index\": " << dt.arg_index << ", \"dtype\": \"" + << dtype_name << "\", \"is_contiguous\": " << (dt.is_contiguous ? "true" : "false") + << ", \"shape\": " << shape_str << ", \"raw_shape\": " << raw_shape_str << ", \"offsets\": " << offsets_str + << ", \"numel\": " << numel << ", \"bin_offset\": " << dt.bin_offset + << ", \"bin_size\": " << dt.payload_size << ", \"truncated\": " << (dt.truncated ? "true" : "false") + << ", \"overwritten\": " << (dt.overwritten ? "true" : "false") << "}"; + } + + json << "\n ]\n}\n"; + json.close(); + + auto export_end = std::chrono::steady_clock::now(); + auto total_ms = std::chrono::duration_cast(export_end - export_start).count(); + LOG_INFO( + "Wrote dump files (%zu tensors, %lu bytes payload) to %s (%ldms)", collected_.size(), bin_offset, + run_dir.c_str(), total_ms + ); + + if (total_truncated_count_ > 0 || total_dropped_count_ > 0 || total_overwrite_count_ > 0) { + LOG_WARN( + "Tensor dump anomalies: truncated=%u, dropped_records=%u, overwritten=%u", total_truncated_count_, + total_dropped_count_, total_overwrite_count_ + ); + } + + // Clear state for potential subsequent runs + collected_.clear(); + total_dropped_count_ = 0; + total_truncated_count_ = 0; + total_overwrite_count_ = 0; + return 0; +} + +int TensorDumpCollector::finalize() { + if (!is_initialized()) { + return 0; + } + + // Free per-thread arena data + for (auto *ptr : arena_data_dev_) { + if (ptr != nullptr && free_cb_ != nullptr) { + free_cb_(ptr); + } + } + arena_data_dev_.clear(); + + // Free per-thread arena headers + for (auto *ptr : arena_headers_dev_) { + if (ptr != nullptr && free_cb_ != nullptr) { + 
free_cb_(ptr); + } + } + arena_headers_dev_.clear(); + + // Free per-thread DumpBuffers + for (auto *ptr : dump_buffers_dev_) { + if (ptr != nullptr && free_cb_ != nullptr) { + free_cb_(ptr); + } + } + dump_buffers_dev_.clear(); + + // Free setup header + if (setup_header_dev_ != nullptr && free_cb_ != nullptr) { + free_cb_(setup_header_dev_); + } + setup_header_dev_ = nullptr; + + collected_.clear(); + num_dump_threads_ = 0; + device_id_ = -1; + + LOG_INFO("TensorDumpCollector finalized"); + return 0; +} diff --git a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp index 9b0745f04..44664e02a 100644 --- a/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/host_build_graph/aicpu/aicpu_executor.cpp @@ -17,13 +17,16 @@ #include "aicpu/device_log.h" #include "aicpu/device_time.h" #include "aicpu/performance_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" #include "aicpu/platform_regs.h" +#include "callable.h" #include "common/memory_barrier.h" #include "common/perf_profiling.h" #include "common/platform_config.h" #include "common/unified_log.h" #include "runtime.h" #include "spin_hint.h" +#include "tensor_info.h" constexpr int MAX_AICPU_THREADS = PLATFORM_MAX_AICPU_THREADS; constexpr int MAX_CORES_PER_THREAD = PLATFORM_MAX_CORES_PER_THREAD; @@ -99,6 +102,9 @@ struct AicpuExecutor { // ===== Performance profiling state ===== uint64_t dispatch_timestamps_[RUNTIME_MAX_WORKER]; // Per-core AICPU dispatch timestamp + // ===== Dump tensor state ===== + Runtime *runtime_{nullptr}; // Cached for dump_tensor access in try_dispatch_task + // ===== Methods ===== int init(Runtime *runtime); int handshake_all_cores(Runtime *runtime); @@ -114,8 +120,8 @@ struct AicpuExecutor { // Helper functions (inline to avoid linker issues, not always_inline to preserve barriers) inline void resolve_task_dependencies( - Task *task, Runtime &runtime, int *cur_ready_queue_aic, int &cur_aic_tail, 
int &cur_aic_ready_count, - int *cur_ready_queue_aiv, int &cur_aiv_tail, int &cur_aiv_ready_count + Task *task, Runtime &runtime, int thread_idx, int *cur_ready_queue_aic, int &cur_aic_tail, + int &cur_aic_ready_count, int *cur_ready_queue_aiv, int &cur_aiv_tail, int &cur_aiv_ready_count ); inline bool try_dispatch_task( @@ -126,13 +132,56 @@ struct AicpuExecutor { static AicpuExecutor g_aicpu_executor; +#if PTO2_DUMP_TENSOR +static int +collect_task_tensor_buffer_addrs(const Runtime &runtime, const Task &task, uint64_t *buffer_addrs, int max_count) { + int found = 0; + for (int arg_idx = 0; arg_idx < task.num_args; arg_idx++) { + uint64_t arg = task.args[arg_idx]; + if (!runtime.is_tensor_buffer_addr(arg)) { + continue; + } + if (found < max_count) { + buffer_addrs[found] = arg; + } + found++; + } + return found; +} +#endif + // ===== Helper Function Implementations ===== // Resolve dependencies: decrement fanin and enqueue newly ready tasks inline void AicpuExecutor::resolve_task_dependencies( - Task *task, Runtime &runtime, int *cur_ready_queue_aic, int &cur_aic_tail, int &cur_aic_ready_count, + Task *task, Runtime &runtime, int thread_idx, int *cur_ready_queue_aic, int &cur_aic_tail, int &cur_aic_ready_count, int *cur_ready_queue_aiv, int &cur_aiv_tail, int &cur_aiv_ready_count ) { + if (task == nullptr) { + return; + } + +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + uint64_t callable_addr = runtime.get_function_bin_addr(task->func_id); + if (callable_addr != 0) { + const CoreCallable *callable = reinterpret_cast(callable_addr); + int tensor_info_count = 0; + const TensorInfo *tensor_info = runtime.get_tensor_info(task->task_id, &tensor_info_count); + uint64_t tensor_buffer_addrs[RUNTIME_MAX_ARGS] = {}; + int tensor_buffer_count = + collect_task_tensor_buffer_addrs(runtime, *task, tensor_buffer_addrs, RUNTIME_MAX_ARGS); + dump_tensors_for_task( + thread_idx, static_cast(task->task_id), 0, task->num_args, task->func_id, *callable, + tensor_info, 
tensor_info_count, tensor_buffer_addrs, tensor_buffer_count, + TensorDumpStage::AFTER_COMPLETION + ); + } + } +#else + (void)thread_idx; +#endif + for (int j = 0; j < task->fanout_count; j++) { int dep_id = task->fanout[j]; Task *dep = runtime.get_task(dep_id); @@ -186,6 +235,28 @@ inline bool AicpuExecutor::try_dispatch_task( running_task_ids_[core_id] ); +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + Task *task = runtime_->get_task(task_id); + if (task != nullptr) { + uint64_t callable_addr = runtime_->get_function_bin_addr(task->func_id); + if (callable_addr != 0) { + const CoreCallable *callable = reinterpret_cast(callable_addr); + int tensor_info_count = 0; + const TensorInfo *tensor_info = runtime_->get_tensor_info(task_id, &tensor_info_count); + uint64_t tensor_buffer_addrs[RUNTIME_MAX_ARGS] = {}; + int tensor_buffer_count = + collect_task_tensor_buffer_addrs(*runtime_, *task, tensor_buffer_addrs, RUNTIME_MAX_ARGS); + dump_tensors_for_task( + thread_idx, static_cast(task_id), 0, task->num_args, task->func_id, *callable, + tensor_info, tensor_info_count, tensor_buffer_addrs, tensor_buffer_count, + TensorDumpStage::BEFORE_DISPATCH + ); + } + } + } +#endif + // Set state before writing register to avoid race with AICore ACK pending_task_ids_[core_id] = task_id; @@ -217,6 +288,7 @@ int AicpuExecutor::init(Runtime *runtime) { // Read execution parameters from runtime thread_num_ = runtime->sche_cpu_num; + runtime_ = runtime; // Simplified defensive check if (thread_num_ < 1 || thread_num_ > MAX_AICPU_THREADS) { @@ -259,6 +331,11 @@ int AicpuExecutor::init(Runtime *runtime) { if (runtime->enable_profiling) { perf_aicpu_init_profiling(runtime); } +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensor_init(thread_num_); + } +#endif init_done_.store(true, std::memory_order_release); LOG_INFO("AicpuExecutor: Init complete"); @@ -689,7 +766,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const Task 
*prev_running_task = runtime.get_task(prev_running_id); resolve_task_dependencies( - prev_running_task, runtime, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, + prev_running_task, runtime, thread_idx, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count ); @@ -698,8 +775,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const Task *task = runtime.get_task(completed_task_id); resolve_task_dependencies( - task, runtime, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, cur_ready_queue_aiv, - cur_aiv_tail, cur_aiv_ready_count + task, runtime, thread_idx, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, + cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count ); made_progress = true; @@ -753,7 +830,7 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const Task *prev_running_task = runtime.get_task(prev_running_id); resolve_task_dependencies( - prev_running_task, runtime, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, + prev_running_task, runtime, thread_idx, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count ); @@ -811,8 +888,8 @@ int AicpuExecutor::resolve_and_dispatch(Runtime &runtime, int thread_idx, const Task *task = runtime.get_task(completed_task_id); resolve_task_dependencies( - task, runtime, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, cur_ready_queue_aiv, - cur_aiv_tail, cur_aiv_ready_count + task, runtime, thread_idx, cur_ready_queue_aic, cur_aic_tail, cur_aic_ready_count, + cur_ready_queue_aiv, cur_aiv_tail, cur_aiv_ready_count ); made_progress = true; @@ -977,6 +1054,12 @@ int AicpuExecutor::run(Runtime *runtime) { return rc; } +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensor_flush(thread_idx); + } +#endif + LOG_INFO("Thread %d: Completed", thread_idx); int prev_finished = finished_count_.fetch_add(1, 
std::memory_order_acq_rel); @@ -995,6 +1078,17 @@ void AicpuExecutor::deinit(Runtime *runtime) { // Next round's Host DMA (rtMemcpy) writes fresh Runtime to HBM but // bypasses this cache. Invalidating now ensures next round reads from HBM. cache_invalidate_range(runtime, sizeof(Runtime)); + if (runtime->get_tensor_info_storage() != nullptr && runtime->get_tensor_info_storage_bytes() > 0) { + cache_invalidate_range( + runtime->get_tensor_info_storage(), static_cast(runtime->get_tensor_info_storage_bytes()) + ); + } + if (runtime->get_tensor_allocation_storage() != nullptr && runtime->get_tensor_allocation_storage_bytes() > 0) { + cache_invalidate_range( + runtime->get_tensor_allocation_storage(), + static_cast(runtime->get_tensor_allocation_storage_bytes()) + ); + } // === Existing reset logic === ready_count_aic_.store(0, std::memory_order_release); diff --git a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp index bac4c3052..57f4e1586 100644 --- a/src/a5/runtime/host_build_graph/host/runtime_maker.cpp +++ b/src/a5/runtime/host_build_graph/host/runtime_maker.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include "callable.h" #include "orchestration_api.h" @@ -46,16 +47,84 @@ namespace { struct OrchestrationRuntimeImpl { const OrchestrationRuntimeOps *ops; Runtime *runtime; + struct TensorInfoBuilder *tensor_info_builder; + struct TensorAllocationBuilder *tensor_allocation_builder; +}; + +struct TensorInfoBuilder { + std::vector> tensor_info_by_task; + + int set_tensor_info_to_task(int task_id, const TensorInfo *tensor_info, int tensor_count) { + if (task_id < 0 || tensor_count < 0 || tensor_count > RUNTIME_MAX_ARGS) { + return -1; + } + if (static_cast(task_id) >= tensor_info_by_task.size()) { + tensor_info_by_task.resize(static_cast(task_id) + 1); + } + std::vector &task_info = tensor_info_by_task[static_cast(task_id)]; + task_info.assign(tensor_info, tensor_info + tensor_count); + 
return 0; + } +}; + +struct TensorAllocationBuilder { + std::vector allocations; + + void record_allocation(void *ptr, size_t size) { + if (ptr == nullptr || size == 0) { + return; + } + allocations.push_back({reinterpret_cast(ptr), static_cast(size)}); + } + + void erase_allocation(void *ptr) { + if (ptr == nullptr) { + return; + } + uint64_t base_addr = reinterpret_cast(ptr); + for (auto it = allocations.begin(); it != allocations.end(); ++it) { + if (it->base_addr == base_addr) { + allocations.erase(it); + return; + } + } + } }; Runtime *unwrap_runtime(OrchestrationRuntime *runtime) { return reinterpret_cast(runtime)->runtime; } +TensorInfoBuilder *unwrap_tensor_info_builder(OrchestrationRuntime *runtime) { + return reinterpret_cast(runtime)->tensor_info_builder; +} + +TensorAllocationBuilder *unwrap_tensor_allocation_builder(OrchestrationRuntime *runtime) { + return reinterpret_cast(runtime)->tensor_allocation_builder; +} + int runtime_add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type) { return unwrap_runtime(runtime)->add_task(args, num_args, func_id, core_type); } +int runtime_set_tensor_info_to_task( + OrchestrationRuntime *runtime, int task_id, const TensorInfo *tensor_info, int tensor_count +) { + Runtime *host_runtime = unwrap_runtime(runtime); + if (task_id < 0 || task_id >= host_runtime->get_task_count()) { + LOG_ERROR("Invalid task_id %d for task tensor info", task_id); + return -1; + } + if (tensor_count == 0) { + return 0; + } + if (tensor_info == nullptr) { + LOG_ERROR("Task %d tensor info pointer is null", task_id); + return -1; + } + return unwrap_tensor_info_builder(runtime)->set_tensor_info_to_task(task_id, tensor_info, tensor_count); +} + void runtime_add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) { unwrap_runtime(runtime)->add_successor(from_task, to_task); } @@ -69,10 +138,13 @@ int runtime_get_task_count(OrchestrationRuntime *runtime) { return unwrap_runtim void 
runtime_print_runtime(OrchestrationRuntime *runtime) { unwrap_runtime(runtime)->print_runtime(); } void *runtime_device_malloc(OrchestrationRuntime *runtime, size_t size) { - return unwrap_runtime(runtime)->host_api.device_malloc(size); + void *ptr = unwrap_runtime(runtime)->host_api.device_malloc(size); + unwrap_tensor_allocation_builder(runtime)->record_allocation(ptr, size); + return ptr; } void runtime_device_free(OrchestrationRuntime *runtime, void *ptr) { + unwrap_tensor_allocation_builder(runtime)->erase_allocation(ptr); unwrap_runtime(runtime)->host_api.device_free(ptr); } @@ -81,8 +153,9 @@ int runtime_copy_to_device(OrchestrationRuntime *runtime, void *dev_ptr, const v } const OrchestrationRuntimeOps k_orchestration_runtime_ops = { - runtime_add_task, runtime_add_successor, runtime_record_tensor_pair, runtime_get_task_count, - runtime_print_runtime, runtime_device_malloc, runtime_device_free, runtime_copy_to_device, + runtime_add_task, runtime_set_tensor_info_to_task, runtime_add_successor, runtime_record_tensor_pair, + runtime_get_task_count, runtime_print_runtime, runtime_device_malloc, runtime_device_free, + runtime_copy_to_device, }; bool write_all_bytes(int fd, const uint8_t *data, size_t size) { @@ -124,6 +197,78 @@ bool create_temp_so_file(const uint8_t *data, size_t size, std::string *out_path return true; } +int upload_tensor_info_storage(Runtime *runtime, const TensorInfoBuilder &builder) { + runtime->clear_tensor_info_storage(); + for (int task_id = 0; task_id < RUNTIME_MAX_TASKS; task_id++) { + runtime->set_tensor_info_range(task_id, 0, 0); + } + + int task_count = runtime->get_task_count(); + std::vector compact_tensor_info; + for (int task_id = 0; task_id < task_count; task_id++) { + const std::vector *task_info = nullptr; + if (static_cast(task_id) < builder.tensor_info_by_task.size()) { + task_info = &builder.tensor_info_by_task[static_cast(task_id)]; + } + uint32_t offset = static_cast(compact_tensor_info.size()); + uint16_t count = 0; + 
if (task_info != nullptr) { + count = static_cast(task_info->size()); + compact_tensor_info.insert(compact_tensor_info.end(), task_info->begin(), task_info->end()); + } + runtime->set_tensor_info_range(task_id, offset, count); + } + + if (compact_tensor_info.empty()) { + return 0; + } + + size_t tensor_info_bytes = compact_tensor_info.size() * sizeof(TensorInfo); + void *dev_tensor_info_storage = runtime->host_api.device_malloc(tensor_info_bytes); + if (dev_tensor_info_storage == nullptr) { + LOG_ERROR("Failed to allocate tensor info storage (%zu bytes)", tensor_info_bytes); + return -1; + } + + int rc = runtime->host_api.copy_to_device(dev_tensor_info_storage, compact_tensor_info.data(), tensor_info_bytes); + if (rc != 0) { + LOG_ERROR("Failed to copy tensor info storage to device: %d", rc); + runtime->host_api.device_free(dev_tensor_info_storage); + return rc; + } + + runtime->set_tensor_info_storage(dev_tensor_info_storage, tensor_info_bytes); + LOG_INFO("Uploaded %zu tensor info entries (%zu bytes)", compact_tensor_info.size(), tensor_info_bytes); + return 0; +} + +int upload_tensor_allocation_storage(Runtime *runtime, const TensorAllocationBuilder &builder) { + runtime->clear_tensor_allocation_storage(); + if (builder.allocations.empty()) { + return 0; + } + + size_t allocation_bytes = builder.allocations.size() * sizeof(TensorAllocationInfo); + void *dev_allocation_storage = runtime->host_api.device_malloc(allocation_bytes); + if (dev_allocation_storage == nullptr) { + LOG_ERROR("Failed to allocate tensor allocation storage (%zu bytes)", allocation_bytes); + return -1; + } + + int rc = runtime->host_api.copy_to_device(dev_allocation_storage, builder.allocations.data(), allocation_bytes); + if (rc != 0) { + LOG_ERROR("Failed to copy tensor allocation storage to device: %d", rc); + runtime->host_api.device_free(dev_allocation_storage); + return rc; + } + + runtime->set_tensor_allocation_storage( + dev_allocation_storage, static_cast(builder.allocations.size()), 
allocation_bytes + ); + LOG_INFO("Uploaded %zu tensor allocation ranges (%zu bytes)", builder.allocations.size(), allocation_bytes); + return 0; +} + } // namespace #ifdef __cplusplus @@ -215,7 +360,11 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip orch_args->tensor_count(), orch_args->scalar_count() ); - OrchestrationRuntimeImpl orchestration_runtime = {&k_orchestration_runtime_ops, runtime}; + TensorInfoBuilder tensor_info_builder; + TensorAllocationBuilder tensor_allocation_builder; + OrchestrationRuntimeImpl orchestration_runtime = { + &k_orchestration_runtime_ops, runtime, &tensor_info_builder, &tensor_allocation_builder + }; // Call orchestration function to build task graph // The orchestration function handles device memory allocation and copy-to-device @@ -227,6 +376,26 @@ int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const Chip return rc; } + rc = upload_tensor_allocation_storage(runtime, tensor_allocation_builder); + if (rc != 0) { + LOG_ERROR("Failed to upload tensor allocations: %d", rc); + runtime->clear_tensor_pairs(); + dlclose(handle); + return rc; + } + + rc = upload_tensor_info_storage(runtime, tensor_info_builder); + if (rc != 0) { + LOG_ERROR("Failed to upload tensor info storage: %d", rc); + if (runtime->get_tensor_allocation_storage() != nullptr) { + runtime->host_api.device_free(runtime->get_tensor_allocation_storage()); + runtime->clear_tensor_allocation_storage(); + } + runtime->clear_tensor_pairs(); + dlclose(handle); + return rc; + } + LOG_INFO("Runtime initialized. Ready for execution from Python."); // Host orchestration is complete once orch_func returns. 
The task graph now @@ -294,6 +463,15 @@ int validate_runtime_impl(Runtime *runtime) { } runtime->clear_registered_kernels(); + if (runtime->get_tensor_info_storage() != nullptr) { + runtime->host_api.device_free(runtime->get_tensor_info_storage()); + runtime->clear_tensor_info_storage(); + } + if (runtime->get_tensor_allocation_storage() != nullptr) { + runtime->host_api.device_free(runtime->get_tensor_allocation_storage()); + runtime->clear_tensor_allocation_storage(); + } + // Clear tensor pairs runtime->clear_tensor_pairs(); diff --git a/src/a5/runtime/host_build_graph/orchestration/orchestration_api.h b/src/a5/runtime/host_build_graph/orchestration/orchestration_api.h index 5d646434c..0f3387480 100644 --- a/src/a5/runtime/host_build_graph/orchestration/orchestration_api.h +++ b/src/a5/runtime/host_build_graph/orchestration/orchestration_api.h @@ -23,11 +23,15 @@ #include "common/core_type.h" #include "task_args.h" +#include "tensor_info.h" typedef struct OrchestrationRuntime OrchestrationRuntime; typedef struct OrchestrationRuntimeOps { int (*add_task)(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type); + int (*set_tensor_info_to_task)( + OrchestrationRuntime *runtime, int task_id, const TensorInfo *tensor_info, int tensor_count + ); void (*add_successor)(OrchestrationRuntime *runtime, int from_task, int to_task); void (*record_tensor_pair)(OrchestrationRuntime *runtime, void *host_ptr, void *dev_ptr, size_t size); int (*get_task_count)(OrchestrationRuntime *runtime); @@ -47,6 +51,25 @@ add_task(OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_i return runtime->ops->add_task(runtime, args, num_args, func_id, core_type); } +static inline int +set_tensor_info_to_task(OrchestrationRuntime *runtime, int task_id, const TensorInfo *tensor_info, int tensor_count) { + return runtime->ops->set_tensor_info_to_task(runtime, task_id, tensor_info, tensor_count); +} + +static inline int 
add_task_with_tensor_info( + OrchestrationRuntime *runtime, uint64_t *args, int num_args, int func_id, CoreType core_type, + const TensorInfo *tensor_info, int tensor_count +) { + int task_id = add_task(runtime, args, num_args, func_id, core_type); + if (task_id < 0) { + return task_id; + } + if (set_tensor_info_to_task(runtime, task_id, tensor_info, tensor_count) != 0) { + return -1; + } + return task_id; +} + static inline void add_successor(OrchestrationRuntime *runtime, int from_task, int to_task) { runtime->ops->add_successor(runtime, from_task, to_task); } diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.cpp b/src/a5/runtime/host_build_graph/runtime/runtime.cpp index 9899b1a48..25af6e4c7 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.cpp +++ b/src/a5/runtime/host_build_graph/runtime/runtime.cpp @@ -48,6 +48,11 @@ Runtime::Runtime() { enable_profiling = false; perf_data_base = 0; tensor_pair_count = 0; + tensor_info_storage_ = nullptr; + tensor_info_storage_bytes_ = 0; + tensor_allocation_storage_ = nullptr; + tensor_allocation_storage_bytes_ = 0; + tensor_allocation_count_ = 0; // Initialize kernel binary tracking registered_kernel_count_ = 0; @@ -56,6 +61,8 @@ Runtime::Runtime() { for (int i = 0; i < RUNTIME_MAX_FUNC_ID; i++) { func_id_to_addr_[i] = 0; } + memset(tensor_info_offsets_, 0, sizeof(tensor_info_offsets_)); + memset(tensor_info_counts_, 0, sizeof(tensor_info_counts_)); } // ============================================================================= diff --git a/src/a5/runtime/host_build_graph/runtime/runtime.h b/src/a5/runtime/host_build_graph/runtime/runtime.h index f29775bf0..551cc0599 100644 --- a/src/a5/runtime/host_build_graph/runtime/runtime.h +++ b/src/a5/runtime/host_build_graph/runtime/runtime.h @@ -39,6 +39,7 @@ #include "common/core_type.h" #include "common/perf_profiling.h" #include "common/platform_config.h" +#include "tensor_info.h" // Logging macros using unified logging interface #include 
"common/unified_log.h" @@ -226,6 +227,17 @@ class Runtime { int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID]; int registered_kernel_count_; + // Tensor info metadata for tensor dump + void *tensor_info_storage_; + uint64_t tensor_info_storage_bytes_; + uint32_t tensor_info_offsets_[RUNTIME_MAX_TASKS]; + uint16_t tensor_info_counts_[RUNTIME_MAX_TASKS]; + + // Device allocation ranges used to recover tensor buffer addresses from task.args[] + void *tensor_allocation_storage_; + uint64_t tensor_allocation_storage_bytes_; + uint32_t tensor_allocation_count_; + public: /** * Constructor - zero-initialize all arrays @@ -336,6 +348,78 @@ class Runtime { */ void clear_tensor_pairs(); + // ========================================================================= + // Tensor Info Metadata + // ========================================================================= + + void set_tensor_info_storage(void *ptr, uint64_t bytes) { + tensor_info_storage_ = ptr; + tensor_info_storage_bytes_ = bytes; + } + + void clear_tensor_info_storage() { + tensor_info_storage_ = nullptr; + tensor_info_storage_bytes_ = 0; + } + + void set_tensor_info_range(int task_id, uint32_t offset, uint16_t count) { + if (task_id < 0 || task_id >= RUNTIME_MAX_TASKS) return; + tensor_info_offsets_[task_id] = offset; + tensor_info_counts_[task_id] = count; + } + + const TensorInfo *get_tensor_info(int task_id, int *count) const { + if (count != nullptr) { + *count = 0; + } + if (task_id < 0 || task_id >= RUNTIME_MAX_TASKS || tensor_info_storage_ == nullptr) { + return nullptr; + } + uint16_t tensor_info_count = tensor_info_counts_[task_id]; + if (tensor_info_count == 0) { + return nullptr; + } + if (count != nullptr) { + *count = static_cast(tensor_info_count); + } + const TensorInfo *base = reinterpret_cast(tensor_info_storage_); + return base + tensor_info_offsets_[task_id]; + } + + void *get_tensor_info_storage() const { return tensor_info_storage_; } + + uint64_t get_tensor_info_storage_bytes() const 
{ return tensor_info_storage_bytes_; } + + void set_tensor_allocation_storage(void *ptr, uint32_t count, uint64_t bytes) { + tensor_allocation_storage_ = ptr; + tensor_allocation_count_ = count; + tensor_allocation_storage_bytes_ = bytes; + } + + void clear_tensor_allocation_storage() { + tensor_allocation_storage_ = nullptr; + tensor_allocation_count_ = 0; + tensor_allocation_storage_bytes_ = 0; + } + + bool is_tensor_buffer_addr(uint64_t addr) const { + if (tensor_allocation_storage_ == nullptr || tensor_allocation_count_ == 0) { + return false; + } + const TensorAllocationInfo *allocations = + reinterpret_cast(tensor_allocation_storage_); + for (uint32_t i = 0; i < tensor_allocation_count_; i++) { + if (allocations[i].contains(addr)) { + return true; + } + } + return false; + } + + void *get_tensor_allocation_storage() const { return tensor_allocation_storage_; } + + uint64_t get_tensor_allocation_storage_bytes() const { return tensor_allocation_storage_bytes_; } + // ========================================================================= // Device Orchestration (stub for API compatibility) // ========================================================================= diff --git a/src/a5/runtime/host_build_graph/runtime/tensor_info.h b/src/a5/runtime/host_build_graph/runtime/tensor_info.h new file mode 100644 index 000000000..c27d65f4e --- /dev/null +++ b/src/a5/runtime/host_build_graph/runtime/tensor_info.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_TENSOR_INFO_H_ +#define SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_TENSOR_INFO_H_ + +#include + +#include "common/platform_config.h" +#include "data_type.h" +#include "tensor_arg.h" + +// ============================================================================= +// Dump Tensor Configuration +// ============================================================================= + +#ifndef PTO2_DUMP_TENSOR +#define PTO2_DUMP_TENSOR 1 +#endif + +struct TensorInfo { + DataType dtype; + uint8_t ndims; + uint16_t reserved; + uint32_t shapes[PLATFORM_DUMP_MAX_DIMS]; + uint32_t raw_shapes[PLATFORM_DUMP_MAX_DIMS]; + uint32_t offsets[PLATFORM_DUMP_MAX_DIMS]; +}; + +static_assert(sizeof(TensorInfo) == 64, "TensorInfo must stay compact"); + +struct TensorAllocationInfo { + uint64_t base_addr; + uint64_t size_bytes; + + bool contains(uint64_t addr) const { return addr >= base_addr && addr < base_addr + size_bytes; } +}; + +static_assert(sizeof(TensorAllocationInfo) == 16, "TensorAllocationInfo must stay compact"); + +inline TensorInfo make_tensor_info( + DataType dtype, uint32_t ndims, const uint32_t *shapes, const uint32_t *raw_shapes = nullptr, + const uint32_t *offsets = nullptr +) { + TensorInfo info = {}; + info.dtype = dtype; + info.ndims = static_cast(ndims); + for (uint32_t i = 0; i < ndims && i < PLATFORM_DUMP_MAX_DIMS; i++) { + info.shapes[i] = shapes[i]; + info.raw_shapes[i] = (raw_shapes != nullptr) ? raw_shapes[i] : shapes[i]; + info.offsets[i] = (offsets != nullptr) ? 
offsets[i] : 0; + } + return info; +} + +inline TensorInfo make_tensor_info_from_tensor_arg(const ContinuousTensor &tensor) { + return make_tensor_info(tensor.dtype, tensor.ndims, tensor.shapes); +} + +#endif // SRC_A5_RUNTIME_HOST_BUILD_GRAPH_RUNTIME_TENSOR_INFO_H_ diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index a46dfbce1..d80dae551 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -36,6 +36,7 @@ // Performance profiling headers #include "aicpu/performance_collector_aicpu.h" +#include "aicpu/tensor_dump_aicpu.h" #include "common/memory_barrier.h" #include "common/perf_profiling.h" #include "common/unified_log.h" @@ -491,6 +492,19 @@ struct AicpuExecutor { #endif bool mixed_complete = rt->scheduler.on_subtask_complete(slot_state); if (mixed_complete) { +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensors_for_task( + thread_idx, slot_state, TensorDumpStage::AFTER_COMPLETION, + [](uint8_t active_mask, uint8_t raw_subtask_id) { + return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); + }, + [this](int32_t func_id) { + return get_function_bin_addr(func_id); + } + ); + } +#endif #if PTO2_SCHED_PROFILING PTO2CompletionStats cstats = rt->scheduler.on_mixed_task_complete(slot_state, thread_idx, local_bufs); notify_edges_total += cstats.fanout_edges; @@ -931,6 +945,19 @@ struct AicpuExecutor { #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensors_for_task( + thread_idx, slot_state, TensorDumpStage::BEFORE_DISPATCH, + [](uint8_t active_mask, uint8_t raw_subtask_id) { + return pto2_subtask_active(active_mask, static_cast(raw_subtask_id)); + }, + [this](int32_t func_id) { + return get_function_bin_addr(func_id); + } + ); + } +#endif if (shape == 
PTO2ResourceShape::MIX) { dispatch_mix_block_to_cluster( thread_idx, cluster_offset, slot_state @@ -1493,6 +1520,11 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa perf_aicpu_set_orch_thread_idx(sched_thread_num_); } #endif +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensor_init(orch_to_sched_ ? thread_num_ : sched_thread_num_); + } +#endif DEV_INFO("Thread %d: one-time init done", thread_idx); pto2_init_complete_.store(true, std::memory_order_release); @@ -2219,6 +2251,12 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa ); #endif +#if PTO2_DUMP_TENSOR + if (get_enable_dump_tensor()) { + dump_tensor_flush(thread_idx); + } +#endif + return cur_thread_completed; } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h index d521123ae..67c9c1980 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2_types.h @@ -69,6 +69,14 @@ #error "PTO2_TENSORMAP_PROFILING requires PTO2_ORCH_PROFILING=1" #endif +// ============================================================================= +// Dump Tensor Configuration +// ============================================================================= + +#ifndef PTO2_DUMP_TENSOR +#define PTO2_DUMP_TENSOR 1 +#endif + // ============================================================================= // Configuration Constants // ============================================================================= diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp index ff0cbda6d..e868830bf 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp @@ -39,6 +39,10 @@ Runtime::Runtime() { pto2_dep_pool_size = 
0; orch_to_sched = false; + // Initialize profiling state + enable_profiling = false; + perf_data_base = 0; + // Initialize tensor pairs tensor_pair_count = 0; diff --git a/src/common/task_interface/chip_call_config.h b/src/common/task_interface/chip_call_config.h index 78a912588..1f1eb7721 100644 --- a/src/common/task_interface/chip_call_config.h +++ b/src/common/task_interface/chip_call_config.h @@ -22,4 +22,5 @@ struct ChipCallConfig { int block_dim = 24; int aicpu_thread_num = 3; bool enable_profiling = false; + bool enable_dump_tensor = false; }; diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 2ef0a9b00..5b115e1fc 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -204,7 +204,8 @@ void ChipWorker::run(const void *callable, const void *args, const ChipCallConfi int rc = run_runtime_fn_( device_ctx_, rt, callable, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(), - aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_profiling ? 1 : 0 + aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_profiling ? 1 : 0, + config.enable_dump_tensor ? 
1 : 0 ); if (rc != 0) { throw std::runtime_error("run_runtime failed with code " + std::to_string(rc)); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index e51ad2bfc..81af41148 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -62,7 +62,8 @@ class ChipWorker : public IWorker { using SetDeviceFn = int (*)(void *, int); using GetRuntimeSizeFn = size_t (*)(); using RunRuntimeFn = int (*)( - void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, int + void *, void *, const void *, const void *, int, int, int, const uint8_t *, size_t, const uint8_t *, size_t, + int, int ); using FinalizeDeviceFn = int (*)(void *); diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index 382806aff..f8a811d94 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -74,12 +74,13 @@ int set_device(DeviceContextHandle ctx, int device_id); * @param aicore_binary AICore executor binary blob * @param aicore_size Size of AICore binary * @param enable_profiling 1 to enable profiling, 0 to disable + * @param enable_dump_tensor 1 to enable tensor dump, 0 to disable * @return 0 on success, negative on error */ int run_runtime( DeviceContextHandle ctx, RuntimeHandle runtime, const void *callable, const void *args, int block_dim, int aicpu_thread_num, int device_id, const uint8_t *aicpu_binary, size_t aicpu_size, const uint8_t *aicore_binary, - size_t aicore_size, int enable_profiling + size_t aicore_size, int enable_profiling, int enable_dump_tensor ); /** diff --git a/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add.cpp b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add.cpp new file mode 100644 index 000000000..b94784ee8 --- /dev/null +++ b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) PyPTO 
Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ float *src0 = reinterpret_cast<__gm__ float *>(args[0]); + __gm__ float *src1 = reinterpret_cast<__gm__ float *>(args[1]); + __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]); + + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(vRows, vCols); + TileData src1Tile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, 
EVENT_ID0); + TSTORE(dstGlobal, dstTile); + + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); +} diff --git a/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add_scalar_inplace.cpp b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add_scalar_inplace.cpp new file mode 100644 index 000000000..fa38098e0 --- /dev/null +++ b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/aiv/kernel_add_scalar_inplace.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ float *inout = reinterpret_cast<__gm__ float *>(args[0]); + + union { + uint64_t u64; + float f32; + } converter; + converter.u64 = args[1]; + float scalar = converter.f32; + + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData srcTile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(srcTile, 0x0); + TASSIGN(dstTile, 0x10000); + + GlobalData inoutGlobal(inout); + + TLOAD(srcTile, inoutGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADDS(dstTile, srcTile, scalar); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(inoutGlobal, dstTile); + + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); +} diff --git a/tests/st/a2a3/host_build_graph/dump_tensor/kernels/orchestration/dump_tensor_orch.cpp b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/orchestration/dump_tensor_orch.cpp new file mode 100644 index 000000000..8c8d807c4 --- /dev/null +++ b/tests/st/a2a3/host_build_graph/dump_tensor/kernels/orchestration/dump_tensor_orch.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +/** + * Dump-tensor interface demo for host_build_graph. + * + * Demonstrates the two ways to register tensor metadata for dump: + * Task 0 (add): add_task() + set_tensor_info_to_task() + * Task 1 (add_scalar_inplace): add_task_with_tensor_info() + * + * Computation: f = (a + b) + 1 (a=2, b=3 → f=6) + */ + +#include "orchestration_api.h" // NOLINT(build/include_subdir) + +extern "C" { + +int build_dump_tensor_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) { + void *host_a = orch_args.tensor(0).data_as(); + void *host_b = orch_args.tensor(1).data_as(); + void *host_f = orch_args.tensor(2).data_as(); + size_t size_a = orch_args.tensor(0).nbytes(); + size_t size_b = orch_args.tensor(1).nbytes(); + size_t size_f = orch_args.tensor(2).nbytes(); + uint32_t size = orch_args.tensor(0).shapes[0]; + + TensorInfo ext_a_info = make_tensor_info_from_tensor_arg(orch_args.tensor(0)); + TensorInfo ext_b_info = make_tensor_info_from_tensor_arg(orch_args.tensor(1)); + TensorInfo ext_f_info = make_tensor_info_from_tensor_arg(orch_args.tensor(2)); + + void *dev_a = device_malloc(runtime, size_a); + copy_to_device(runtime, dev_a, host_a, size_a); + + void *dev_b = device_malloc(runtime, size_b); + copy_to_device(runtime, dev_b, host_b, size_b); + + void *dev_f = device_malloc(runtime, size_f); + record_tensor_pair(runtime, host_f, dev_f, size_f); + + // Task 0: a + b → f (add_task + set_tensor_info_to_task) + uint64_t args_t0[4] = { + reinterpret_cast(dev_a), + 
reinterpret_cast(dev_b), + reinterpret_cast(dev_f), + size, + }; + int t0 = add_task(runtime, args_t0, 4, 0, CoreType::AIV); + TensorInfo t0_info[] = {ext_a_info, ext_b_info, ext_f_info}; + set_tensor_info_to_task(runtime, t0, t0_info, 3); + + // Task 1: f += 1.0 (add_task_with_tensor_info) + union { + float f32; + uint64_t u64; + } sc; + sc.f32 = 1.0f; + uint64_t args_t1[3] = {reinterpret_cast(dev_f), sc.u64, size}; + TensorInfo t1_info[] = {ext_f_info}; + int t1 = add_task_with_tensor_info(runtime, args_t1, 3, 1, CoreType::AIV, t1_info, 1); + + add_successor(runtime, t0, t1); + + return 0; +} + +} // extern "C" diff --git a/tests/st/a2a3/host_build_graph/dump_tensor/test_dump_tensor_example.py b/tests/st/a2a3/host_build_graph/dump_tensor/test_dump_tensor_example.py new file mode 100644 index 000000000..531d98f5f --- /dev/null +++ b/tests/st/a2a3/host_build_graph/dump_tensor/test_dump_tensor_example.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Host-build-graph dump tensor example: f = (a + b) + 1. 
+ +Demonstrates the two dump-tensor metadata registration APIs: + Task 0 (add): add_task() + set_tensor_info_to_task() + Task 1 (add_scalar_inplace): add_task_with_tensor_info() +""" + +import torch +from simpler.task_interface import ArgDirection as D + +from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test + + +@scene_test(level=2, runtime="host_build_graph") +class TestDumpTensorExample(SceneTestCase): + """f = (a + b) + 1, where a=2.0, b=3.0 -> f=6.0.""" + + CALLABLE = { + "orchestration": { + "source": "kernels/orchestration/dump_tensor_orch.cpp", + "function_name": "build_dump_tensor_graph", + "signature": [D.IN, D.IN, D.OUT], + }, + "incores": [ + { + "func_id": 0, + "source": "kernels/aiv/kernel_add.cpp", + "core_type": "aiv", + "signature": [D.IN, D.IN, D.OUT], + }, + { + "func_id": 1, + "source": "kernels/aiv/kernel_add_scalar_inplace.cpp", + "core_type": "aiv", + "signature": [D.INOUT], + }, + ], + } + + CASES = [ + { + "name": "default", + "platforms": ["a2a3"], + "config": {"aicpu_thread_num": 3, "block_dim": 3}, + "params": {}, + }, + ] + + def generate_args(self, params): + SIZE = 128 * 128 + return TaskArgsBuilder( + Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)), + Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)), + Tensor("f", torch.zeros(SIZE, dtype=torch.float32)), + ) + + def compute_golden(self, args, params): + args.f[:] = (args.a + args.b) + 1 + + +if __name__ == "__main__": + SceneTestCase.run_module(__name__) diff --git a/tests/st/a5/host_build_graph/dump_tensor/kernels/aiv/kernel_add.cpp b/tests/st/a5/host_build_graph/dump_tensor/kernels/aiv/kernel_add.cpp new file mode 100644 index 000000000..bdc99577c --- /dev/null +++ b/tests/st/a5/host_build_graph/dump_tensor/kernels/aiv/kernel_add.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ float *src0 = reinterpret_cast<__gm__ float *>(args[0]); + __gm__ float *src1 = reinterpret_cast<__gm__ float *>(args[1]); + __gm__ float *out = reinterpret_cast<__gm__ float *>(args[2]); + + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = pto::Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData src0Tile(vRows, vCols); + TileData src1Tile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(src0Tile, 0x0); + TASSIGN(src1Tile, 0x10000); + TASSIGN(dstTile, 0x20000); + + GlobalData src0Global(src0); + GlobalData src1Global(src1); + GlobalData dstGlobal(out); + + TLOAD(src0Tile, src0Global); + TLOAD(src1Tile, src1Global); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADD(dstTile, src0Tile, src1Tile); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, 
EVENT_ID0); + TSTORE(dstGlobal, dstTile); + + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); +} diff --git a/tests/st/a5/host_build_graph/dump_tensor/kernels/aiv/kernel_add_scalar_inplace.cpp b/tests/st/a5/host_build_graph/dump_tensor/kernels/aiv/kernel_add_scalar_inplace.cpp new file mode 100644 index 000000000..5b4b506d4 --- /dev/null +++ b/tests/st/a5/host_build_graph/dump_tensor/kernels/aiv/kernel_add_scalar_inplace.cpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include + +using namespace pto; + +#ifndef __gm__ +#define __gm__ +#endif + +#ifndef __aicore__ +#define __aicore__ [aicore] +#endif + +extern "C" __aicore__ __attribute__((always_inline)) void kernel_entry(__gm__ int64_t *args) { + __gm__ float *inout = reinterpret_cast<__gm__ float *>(args[0]); + + union { + uint64_t u64; + float f32; + } converter; + converter.u64 = args[1]; + float scalar = converter.f32; + + constexpr int kTRows_ = 128; + constexpr int kTCols_ = 128; + constexpr int vRows = 128; + constexpr int vCols = 128; + + using DynShapeDim5 = Shape<1, 1, 1, vRows, vCols>; + using DynStridDim5 = pto::Stride<1, 1, 1, kTCols_, 1>; + using GlobalData = GlobalTensor; + using TileData = Tile; + + TileData srcTile(vRows, vCols); + TileData dstTile(vRows, vCols); + TASSIGN(srcTile, 0x0); + TASSIGN(dstTile, 0x10000); + + GlobalData inoutGlobal(inout); + + TLOAD(srcTile, inoutGlobal); + set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0); + TADDS(dstTile, srcTile, scalar); + set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0); + TSTORE(inoutGlobal, dstTile); + + set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); + wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7); +} diff --git a/tests/st/a5/host_build_graph/dump_tensor/kernels/orchestration/dump_tensor_orch.cpp b/tests/st/a5/host_build_graph/dump_tensor/kernels/orchestration/dump_tensor_orch.cpp new file mode 100644 index 000000000..8c8d807c4 --- /dev/null +++ b/tests/st/a5/host_build_graph/dump_tensor/kernels/orchestration/dump_tensor_orch.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 * -----------------------------------------------------------------------------------------------------------
 */
/**
 * Dump-tensor interface demo for host_build_graph.
 *
 * Demonstrates the two ways to register tensor metadata for dump:
 *   Task 0 (add): add_task() + set_tensor_info_to_task()
 *   Task 1 (add_scalar_inplace): add_task_with_tensor_info()
 *
 * Computation: f = (a + b) + 1 (a=2, b=3 → f=6)
 */

#include "orchestration_api.h"  // NOLINT(build/include_subdir)

extern "C" {

/**
 * Build the two-task graph f = (a + b) + 1 on `runtime`.
 *
 * orch_args tensors: [0]=a (IN), [1]=b (IN), [2]=f (OUT).
 * Returns 0 on success.
 *
 * NOTE(review): several template argument lists (`data_as<...>()`,
 * `reinterpret_cast<...>`) were stripped by extraction in this view —
 * restore them from the original source before compiling.
 */
int build_dump_tensor_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
    // Host pointers and byte sizes of the three external tensors.
    void *host_a = orch_args.tensor(0).data_as();
    void *host_b = orch_args.tensor(1).data_as();
    void *host_f = orch_args.tensor(2).data_as();
    size_t size_a = orch_args.tensor(0).nbytes();
    size_t size_b = orch_args.tensor(1).nbytes();
    size_t size_f = orch_args.tensor(2).nbytes();
    // Element count taken from tensor 0's leading dimension; passed to kernels.
    uint32_t size = orch_args.tensor(0).shapes[0];

    // Dump-tensor metadata (dtype/shape/etc.) derived from the external args.
    TensorInfo ext_a_info = make_tensor_info_from_tensor_arg(orch_args.tensor(0));
    TensorInfo ext_b_info = make_tensor_info_from_tensor_arg(orch_args.tensor(1));
    TensorInfo ext_f_info = make_tensor_info_from_tensor_arg(orch_args.tensor(2));

    // Stage inputs a and b onto the device.
    void *dev_a = device_malloc(runtime, size_a);
    copy_to_device(runtime, dev_a, host_a, size_a);

    void *dev_b = device_malloc(runtime, size_b);
    copy_to_device(runtime, dev_b, host_b, size_b);

    // Output f: allocate on device and register the host/device pair so the
    // runtime can copy the result back after execution.
    void *dev_f = device_malloc(runtime, size_f);
    record_tensor_pair(runtime, host_f, dev_f, size_f);

    // Task 0: a + b → f (add_task + set_tensor_info_to_task)
    uint64_t args_t0[4] = {
        reinterpret_cast(dev_a),
        reinterpret_cast(dev_b),
        reinterpret_cast(dev_f),
        size,
    };
    // func_id 0 on an AIV core; tensor info attached in a separate call.
    int t0 = add_task(runtime, args_t0, 4, 0, CoreType::AIV);
    TensorInfo t0_info[] = {ext_a_info, ext_b_info, ext_f_info};
    set_tensor_info_to_task(runtime, t0, t0_info, 3);

    // Task 1: f += 1.0 (add_task_with_tensor_info)
    // The scalar 1.0f travels bit-cast inside a uint64 task argument.
    union {
        float f32;
        uint64_t u64;
    } sc;
    sc.f32 = 1.0f;
    uint64_t args_t1[3] = {reinterpret_cast(dev_f), sc.u64, size};
    TensorInfo t1_info[] = {ext_f_info};
    // func_id 1; tensor info registered atomically with the task.
    int t1 = add_task_with_tensor_info(runtime, args_t1, 3, 1, CoreType::AIV, t1_info, 1);

    // t1 consumes t0's output f, so order t0 before t1.
    add_successor(runtime, t0, t1);

    return 0;
}

} // extern "C"
diff --git a/tests/st/a5/host_build_graph/dump_tensor/test_dump_tensor_example.py b/tests/st/a5/host_build_graph/dump_tensor/test_dump_tensor_example.py
new file mode 100644
index 000000000..2295d8985
--- /dev/null
+++ b/tests/st/a5/host_build_graph/dump_tensor/test_dump_tensor_example.py
@@ -0,0 +1,71 @@
#!/usr/bin/env python3
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details. You may not use this file except in compliance with the License.
# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
# See LICENSE in the root of the software repository for the full text of the License.
# -----------------------------------------------------------------------------------------------------------
"""Host-build-graph dump tensor example: f = (a + b) + 1.

Demonstrates the two dump-tensor metadata registration APIs:
    Task 0 (add): add_task() + set_tensor_info_to_task()
    Task 1 (add_scalar_inplace): add_task_with_tensor_info()
"""

import torch
from simpler.task_interface import ArgDirection as D

from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test


@scene_test(level=2, runtime="host_build_graph")
class TestDumpTensorExample(SceneTestCase):
    """f = (a + b) + 1, where a=2.0, b=3.0 -> f=6.0."""

    # Declarative wiring consumed by the scene-test framework:
    #  - "orchestration": host-side graph builder (build_dump_tensor_graph)
    #    with the per-tensor direction signature [a IN, b IN, f OUT];
    #  - "incores": the two AIV kernels, keyed by func_id, each with its own
    #    per-argument direction signature used by the dump-tensor machinery.
    CALLABLE = {
        "orchestration": {
            "source": "kernels/orchestration/dump_tensor_orch.cpp",
            "function_name": "build_dump_tensor_graph",
            "signature": [D.IN, D.IN, D.OUT],
        },
        "incores": [
            {
                "func_id": 0,
                "source": "kernels/aiv/kernel_add.cpp",
                "core_type": "aiv",
                "signature": [D.IN, D.IN, D.OUT],
            },
            {
                "func_id": 1,
                "source": "kernels/aiv/kernel_add_scalar_inplace.cpp",
                "core_type": "aiv",
                "signature": [D.INOUT],
            },
        ],
    }

    # Single case, a5 only. aicpu_thread_num/block_dim of 3 — presumably
    # chosen to exercise multi-thread dispatch; confirm against runner docs.
    CASES = [
        {
            "name": "default",
            "platforms": ["a5"],
            "config": {"aicpu_thread_num": 3, "block_dim": 3},
            "params": {},
        },
    ]

    def generate_args(self, params):
        """Build task inputs: a=2.0, b=3.0 and zeroed output f (128*128 float32 each)."""
        SIZE = 128 * 128
        return TaskArgsBuilder(
            Tensor("a", torch.full((SIZE,), 2.0, dtype=torch.float32)),
            Tensor("b", torch.full((SIZE,), 3.0, dtype=torch.float32)),
            Tensor("f", torch.zeros(SIZE, dtype=torch.float32)),
        )

    def compute_golden(self, args, params):
        """Host-side reference: f = (a + b) + 1, written into args.f in place."""
        args.f[:] = (args.a + args.b) + 1


if __name__ == "__main__":
    SceneTestCase.run_module(__name__)
diff --git a/tools/dump_viewer.py b/tools/dump_viewer.py
new file mode 100644
index 000000000..557710ed9
--- /dev/null
+++ b/tools/dump_viewer.py
@@ -0,0 +1,318 @@
#!/usr/bin/env python3
# Copyright (c) PyPTO Contributors.
# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
# CANN Open Software License Agreement Version 2.0 (the "License").
# Please refer to the License for details.
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- +"""Tensor dump viewer — extract tensors from tensors.bin to txt files. + +Filters (freely combinable): + --task Filter by task_id (hex, e.g. 0x0000000200000a00) + --func Filter by func_id (int) + --stage Filter by stage (before / after) + --role Filter by role (input / output / inout) + --arg Filter by arg_index (int) + +With no filters: lists all tensors. +With filters: lists matching tensors. Add --export to save them to txt. + +Usage: + # List all tensors (auto-picks latest outputs/tensor_dump_* dir) + python tools/dump_viewer.py + + # List all tensors in a specific dump dir + python tools/dump_viewer.py outputs/tensor_dump_xxx/ + + # List before-dispatch inputs of func_id=3 (latest dir) + python tools/dump_viewer.py --func 3 --stage before --role input + + # List before-dispatch inputs of func_id=3 (specific dir) + python tools/dump_viewer.py outputs/tensor_dump_xxx/ --func 3 --stage before --role input + + # Export them to txt + python tools/dump_viewer.py outputs/tensor_dump_xxx/ --func 3 --stage before --role input --export + + # Export a specific tensor by index + python tools/dump_viewer.py outputs/tensor_dump_xxx/ --index 42 --export +""" + +from __future__ import annotations + +import argparse +import json +import struct +import sys +from pathlib import Path + +DTYPE_INFO = { + "float32": ("f", 4), + "float16": ("e", 2), + "bfloat16": (None, 2), + "int32": ("i", 4), + "int64": ("q", 8), + "uint64": ("Q", 8), + "int16": ("h", 2), + "int8": ("b", 1), + "uint8": ("B", 1), +} + + +def 
bfloat16_to_float32(raw: int) -> float: + return struct.unpack("f", struct.pack("I", raw << 16))[0] + + +def read_tensor_data(bin_path: Path, offset: int, size: int) -> bytes: + with open(bin_path, "rb") as f: + f.seek(offset) + return f.read(size) + + +def decode_elements(data: bytes, dtype: str, count: int) -> list: + dtype_lower = dtype.lower() + fmt, elem_sz = DTYPE_INFO.get(dtype_lower, (None, 1)) + if dtype_lower == "bfloat16": + return [bfloat16_to_float32(struct.unpack_from("H", data, i * 2)[0]) for i in range(count)] + if fmt is None: + return [f"0x{data[i]:02x}" for i in range(count)] + return [struct.unpack_from(fmt, data, i * elem_sz)[0] for i in range(count)] + + +def format_element(val, dtype: str) -> str: + dtype_lower = dtype.lower() + if isinstance(val, float): + if dtype_lower == "float32": + return f"{val:.6g}" + elif dtype_lower == "float16": + return f"{val:.4g}" + elif dtype_lower == "bfloat16": + return f"{val:.3g}" + return str(val) + + +def tensor_filename(t: dict) -> str: + stage_map = {"before_dispatch": "before", "after_completion": "after"} + role_map = {"input": "in", "output": "out", "inout": "inout"} + stage_str = stage_map.get(t["stage"], t["stage"]) + role_str = role_map.get(t["role"], t["role"]) + return f"task_{t['task_id']}_s{t['subtask_id']}_{stage_str}_{role_str}{t['arg_index']}.txt" + + +def write_tensor(tensor: dict, bin_path: Path, out): + t = tensor + out.write(f"# task_id: {t['task_id']}\n") + out.write(f"# subtask_id: {t['subtask_id']}\n") + out.write(f"# role: {t['role']}\n") + out.write(f"# stage: {t['stage']}\n") + out.write(f"# arg_index: {t['arg_index']}\n") + out.write(f"# func_id: {t['func_id']}\n") + out.write(f"# dtype: {t['dtype']}\n") + out.write(f"# is_contiguous: {t['is_contiguous']}\n") + out.write(f"# shape: {t['shape']}\n") + out.write(f"# raw_shape: {t['raw_shape']}\n") + out.write(f"# offsets: {t['offsets']}\n") + + if t.get("overwritten"): + out.write("# DATA OVERWRITTEN (host too slow)\n") + return + 
if t.get("truncated"): + out.write("# DATA TRUNCATED (tensor too large for arena)\n") + + bin_size = t.get("bin_size", 0) + if bin_size == 0: + out.write("# (no data)\n") + return + + data = read_tensor_data(bin_path, t["bin_offset"], bin_size) + shape = t["shape"] + numel = 1 + for d in shape: + numel *= d + if numel == 0: + numel = 1 + + _, elem_sz = DTYPE_INFO.get(t["dtype"].lower(), (None, 1)) + max_from_bytes = len(data) // elem_sz + numel = min(numel, max_from_bytes) + + elements = decode_elements(data, t["dtype"], numel) + formatted = [format_element(v, t["dtype"]) for v in elements] + + out.write("\n# Overview:\n") + col_width = max(len(s) for s in formatted) if formatted else 1 + last_dim = shape[-1] if shape else numel + if last_dim == 0: + last_dim = numel + + for i, s in enumerate(formatted): + if i > 0 and (i % last_dim) == 0: + out.write("\n") + elif i > 0: + out.write(" ") + out.write(f"{s:>{col_width}}") + out.write("\n") + + out.write("\n# Detail:\n") + strides = [1] * len(shape) + for d in range(len(shape) - 2, -1, -1): + strides[d] = strides[d + 1] * shape[d + 1] + + for i, s in enumerate(formatted): + idx = [] + rem = i + for d in range(len(shape)): + idx.append(rem // strides[d]) + rem %= strides[d] + idx_str = ", ".join(str(x) for x in idx) + out.write(f"[{idx_str}] {s}\n") + + +def export_tensor(tensor: dict, bin_path: Path, dump_dir: Path): + txt_dir = dump_dir / "txt" + txt_dir.mkdir(exist_ok=True) + fname = tensor_filename(tensor) + txt_path = txt_dir / fname + with open(txt_path, "w") as f: + write_tensor(tensor, bin_path, f) + return txt_path + + +def collect_valid_values(tensors: list, field: str) -> list: + return sorted(set(str(t[field]) for t in tensors)) + + +def list_tensors(tensors: list): + print( + f"{'idx':>6} {'task_id':>18} {'s':>1} {'stage':>7} {'role':>5}" + f" {'arg':>3} {'func':>4} {'dtype':>8} {'shape':<20} {'bytes':>10}" + ) + print("-" * 100) + for i, t in enumerate(tensors): + stage_short = "before" if t["stage"] == 
"before_dispatch" else "after" + print( + f"{i:>6} {t['task_id']:>18} {t['subtask_id']:>1} {stage_short:>7} {t['role']:>5} " + f"{t['arg_index']:>3} {t['func_id']:>4} {t['dtype']:>8} {str(t['shape']):<20} {t['bin_size']:>10}" + ) + + +def _resolve_dump_dir(dump_dir_arg: str | None) -> Path: + if dump_dir_arg is not None: + return Path(dump_dir_arg) + candidates = sorted(Path("outputs").glob("tensor_dump_*"), key=lambda p: p.name) + if not candidates: + print("Error: no tensor_dump_* directory found in outputs/", file=sys.stderr) + sys.exit(1) + print(f"Using latest dump directory: {candidates[-1]}") + return candidates[-1] + + +def _apply_filters(tensors: list, args: argparse.Namespace) -> list: + filtered = tensors + + if args.task: + valid = collect_valid_values(tensors, "task_id") + if args.task not in valid: + print(f"Error: --task {args.task} not found.", file=sys.stderr) + sample = valid[:5] + print(f" Valid task_ids (showing first {len(sample)}): {', '.join(sample)}", file=sys.stderr) + sys.exit(1) + filtered = [t for t in filtered if t["task_id"] == args.task] + + if args.func is not None: + valid = collect_valid_values(filtered, "func_id") + if str(args.func) not in valid: + print(f"Error: --func {args.func} not found in current selection.", file=sys.stderr) + print(f" Valid func_ids: {', '.join(valid)}", file=sys.stderr) + sys.exit(1) + filtered = [t for t in filtered if t["func_id"] == args.func] + + if args.stage: + stage_map = {"before": "before_dispatch", "after": "after_completion"} + if args.stage not in stage_map: + print(f"Error: --stage must be 'before' or 'after', got '{args.stage}'", file=sys.stderr) + sys.exit(1) + filtered = [t for t in filtered if t["stage"] == stage_map[args.stage]] + + if args.role: + valid_roles = {"input", "output", "inout"} + if args.role not in valid_roles: + print(f"Error: --role must be one of {valid_roles}, got '{args.role}'", file=sys.stderr) + sys.exit(1) + filtered = [t for t in filtered if t["role"] == 
args.role] + + if args.arg is not None: + valid = collect_valid_values(filtered, "arg_index") + if str(args.arg) not in valid: + print(f"Error: --arg {args.arg} not found in current selection.", file=sys.stderr) + print(f" Valid arg_indices: {', '.join(valid)}", file=sys.stderr) + sys.exit(1) + filtered = [t for t in filtered if t["arg_index"] == args.arg] + + return filtered + + +def main(): + parser = argparse.ArgumentParser( + description="Tensor dump viewer — extract tensors from tensors.bin to txt files", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "dump_dir", + nargs="?", + default=None, + help="Path to tensor_dump_YYYYMMDD_HHMMSS directory (default: latest outputs/tensor_dump_* dir)", + ) + parser.add_argument("--task", "-t", help="Filter by task_id (e.g. 0x0000000200000a00)") + parser.add_argument("--func", "-f", type=int, help="Filter by func_id") + parser.add_argument("--stage", "-s", help="Filter by stage (before / after)") + parser.add_argument("--role", "-r", help="Filter by role (input / output / inout)") + parser.add_argument("--arg", "-a", type=int, help="Filter by arg_index") + parser.add_argument("--index", "-i", type=int, help="Select tensor by index in manifest") + parser.add_argument("--export", "-e", action="store_true", help="Export filtered tensors to txt") + args = parser.parse_args() + + dump_dir = _resolve_dump_dir(args.dump_dir) + manifest_files = list(dump_dir.glob("*.json")) + if not manifest_files: + print(f"Error: no manifest JSON found in {dump_dir}", file=sys.stderr) + sys.exit(1) + + with open(manifest_files[0]) as f: + manifest = json.load(f) + + bin_path = dump_dir / manifest.get("bin_file", "tensors.bin") + tensors = manifest["tensors"] + + filtered = _apply_filters(tensors, args) + + # --- Select by index --- + if args.index is not None: + if args.index < 0 or args.index >= len(tensors): + print(f"Error: --index {args.index} out of range (0-{len(tensors) - 1})", file=sys.stderr) + 
sys.exit(1) + filtered = [tensors[args.index]] + args.export = True # --index always exports + + if not filtered: + print("No tensors match the given filters.", file=sys.stderr) + sys.exit(1) + + # --- Export or list --- + has_filters = any([args.task, args.func is not None, args.stage, args.role, args.arg is not None]) + + if args.export or args.index is not None: + for t in filtered: + txt_path = export_tensor(t, bin_path, dump_dir) + print(f"Saved: {txt_path}") + print(f"\nExported {len(filtered)} tensor(s) to {dump_dir / 'txt/'}") + else: + if has_filters: + print(f"Filtered: {len(filtered)}/{len(tensors)} tensors") + print(f"Add --export to save these tensors to {dump_dir / 'txt/'}\n") + list_tensors(filtered) + + +if __name__ == "__main__": + main()