From 0b39cb0f25db4a9143455f316017dfa17b60001d Mon Sep 17 00:00:00 2001 From: wangzihao122 Date: Fri, 10 Apr 2026 02:46:58 +0800 Subject: [PATCH] Refactor: introduce tiered profiling levels for a2a3 tensormap_and_ringbuffer swimlane export Replace the boolean `enable_profiling` flag with a 4-level `perf_level` (0=off, 1=AICore-only, 2=task+fanout, 3=full with AICPU phase records). The tensormap_and_ringbuffer runtime honors all four levels, while legacy host_build_graph / aicpu_build_graph paths continue to treat any non-zero value as a simple on/off and stay on their existing bool member (synchronized via a shared SFINAE helper in runtime_profiling_mode.h). CLI and JSON are lifted to match: - `--enable-profiling` in run_example.py now takes an optional int (default 3 when flag given, 0 otherwise). - The swimlane JSON schema gains a new version=0 (level 1: AICore-only) that omits dispatch/finish/fanout fields, and swimlane_converter.py accepts it. - Phase buffer allocation, scheduler-phase recording and orchestrator summary writes in aicpu_executor.cpp are gated on perf_level>=3 so lower levels no longer pay the phase-profiling overhead; fanout/dispatch_timestamp collection is gated on perf_level>=2. Additionally: - CallConfig and WorkerPayload switch from bool to int; Python bindings accept both bool and int for backward compatibility (_normalize_perf_level in code_runner.py, getter/setter shim in task_interface.cpp). - PerformanceCollector skips phase-buffer shared-memory allocation and phase-thread management when perf_level < 3 (calc_perf_data_size path). - device_runner.cpp (onboard + sim): all enable_profiling guards replaced with perf_level > 0; set_perf_level() called before initialize(). - Unit tests updated for int-based profiling values. 
--- examples/scripts/code_runner.py | 16 ++- examples/scripts/run_example.py | 8 +- python/bindings/task_interface.cpp | 22 ++- python/simpler/worker.py | 2 +- .../include/host/performance_collector.h | 9 ++ .../include/host/runtime_profiling_mode.h | 37 +++++ .../platform/onboard/host/device_runner.cpp | 11 +- .../onboard/host/pto_runtime_c_api.cpp | 5 +- src/a2a3/platform/sim/host/device_runner.cpp | 11 +- .../platform/sim/host/pto_runtime_c_api.cpp | 5 +- .../src/aicpu/performance_collector_aicpu.cpp | 2 +- .../src/host/performance_collector.cpp | 134 ++++++++++-------- .../aicpu_build_graph/runtime/runtime.h | 1 + .../host_build_graph/runtime/runtime.cpp | 1 + .../host_build_graph/runtime/runtime.h | 3 +- .../aicore/aicore_executor.cpp | 2 +- .../aicpu/aicpu_executor.cpp | 107 +++++++------- .../runtime/pto_orchestrator.cpp | 4 +- .../runtime/pto_orchestrator.h | 3 +- .../runtime/runtime.h | 2 +- src/common/distributed/dist_chip_process.cpp | 2 +- src/common/distributed/dist_types.h | 2 +- src/common/worker/chip_worker.cpp | 4 +- src/common/worker/chip_worker.h | 2 +- tests/ut/py/test_chip_worker.py | 6 +- tools/swimlane_converter.py | 11 +- 26 files changed, 256 insertions(+), 156 deletions(-) create mode 100644 src/a2a3/platform/include/host/runtime_profiling_mode.h diff --git a/examples/scripts/code_runner.py b/examples/scripts/code_runner.py index 38bd3eb2a..358c087c9 100644 --- a/examples/scripts/code_runner.py +++ b/examples/scripts/code_runner.py @@ -133,6 +133,12 @@ def _get_project_root() -> Path: return Path(__file__).parent.parent.parent # examples/scripts/ -> examples/ -> simpler/ +def _normalize_perf_level(v) -> int: + if isinstance(v, bool): + return 3 if v else 0 + return int(v) + + def _get_pto_isa_clone_path() -> Path: """Get the expected path to pto-isa clone.""" return _get_project_root() / "examples" / "scripts" / "_deps" / "pto-isa" @@ -477,7 +483,7 @@ def __init__( # noqa: PLR0913 golden_path: str, device_id: Optional[int] = None, 
platform: str = "a2a3", - enable_profiling: bool = False, + enable_profiling: int = 0, run_all_cases: bool = False, case_name: Optional[str] = None, pto_isa_commit: Optional[str] = None, @@ -492,7 +498,7 @@ def __init__( # noqa: PLR0913 self.kernels_dir = Path(kernels_dir).resolve() self.golden_path = Path(golden_path).resolve() self.platform = platform - self.enable_profiling = enable_profiling + self._perf_level = _normalize_perf_level(enable_profiling) self.skip_golden = skip_golden self.project_root = _get_project_root() @@ -887,9 +893,9 @@ def _compile_one_kernel(kernel): config = ChipCallConfig() config.block_dim = self.block_dim config.aicpu_thread_num = self.aicpu_thread_num - if self.enable_profiling and round_idx == 0: - config.enable_profiling = True - logger.info("Profiling enabled") + if self._perf_level > 0 and round_idx == 0: + config.enable_profiling = self._perf_level + logger.info(f"Swimlane profiling enabled (mode={self._perf_level})") with _temporary_env(run_env): worker.run(chip_callable, orch_args, config) diff --git a/examples/scripts/run_example.py b/examples/scripts/run_example.py index 89ab84199..2a65dfeda 100644 --- a/examples/scripts/run_example.py +++ b/examples/scripts/run_example.py @@ -148,8 +148,12 @@ def compute_golden(tensors: dict, params: dict) -> None: parser.add_argument( "--enable-profiling", - action="store_true", - help="Enable profiling and generate swimlane.json", + type=int, + nargs="?", + const=3, + default=0, + metavar="LEVEL", + help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)", ) parser.add_argument( diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 2e1962df0..87d252a2b 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -579,11 +579,23 @@ NB_MODULE(_task_interface, m) { .def(nb::init<>()) .def_rw("block_dim", &ChipCallConfig::block_dim) .def_rw("aicpu_thread_num", 
&ChipCallConfig::aicpu_thread_num) - .def_rw("enable_profiling", &ChipCallConfig::enable_profiling) + .def_prop_rw( + "enable_profiling", + [](const ChipCallConfig &self) { + return self.perf_level; + }, + [](ChipCallConfig &self, nb::object v) { + if (nb::isinstance(v)) { + self.perf_level = nb::cast(v) ? 3 : 0; + } else { + self.perf_level = nb::cast(v); + } + } + ) .def("__repr__", [](const ChipCallConfig &self) -> std::string { std::ostringstream os; os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num - << ", enable_profiling=" << (self.enable_profiling ? "True" : "False") << ")"; + << ", enable_profiling=" << self.perf_level << ")"; return os.str(); }); @@ -608,15 +620,15 @@ NB_MODULE(_task_interface, m) { .def( "run_raw", [](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num, - bool enable_profiling) { + int perf_level) { ChipCallConfig config; config.block_dim = block_dim; config.aicpu_thread_num = aicpu_thread_num; - config.enable_profiling = enable_profiling; + config.perf_level = perf_level; self.run(reinterpret_cast(callable), reinterpret_cast(args), config); }, nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3, - nb::arg("enable_profiling") = false, "Run with raw pointer arguments (used from forked chip process)." + nb::arg("perf_level") = 0, "Run with raw pointer arguments (used from forked chip process)." 
) .def_prop_ro("device_id", &ChipWorker::device_id) .def_prop_ro("initialized", &ChipWorker::initialized) diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 3e8d2e9aa..5508f8c0b 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -174,7 +174,7 @@ def _chip_process_loop( error = 0 try: - cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, bool(profiling)) + cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, profiling) except Exception: # noqa: BLE001 error = 1 struct.pack_into("i", buf, _CHIP_OFF_ERROR, error) diff --git a/src/a2a3/platform/include/host/performance_collector.h b/src/a2a3/platform/include/host/performance_collector.h index cf6a52e2b..98644abbf 100644 --- a/src/a2a3/platform/include/host/performance_collector.h +++ b/src/a2a3/platform/include/host/performance_collector.h @@ -332,6 +332,12 @@ class PerformanceCollector { */ bool is_initialized() const { return perf_shared_mem_host_ != nullptr; } + /** + * Set profiling level before initialize(). + * 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase buffers) + */ + void set_perf_level(int level) { perf_level_ = level; } + /** * Drain remaining buffers from the memory manager's ready queue * @@ -387,6 +393,9 @@ class PerformanceCollector { PerfRegisterCallback register_cb_{nullptr}; PerfFreeCallback free_cb_{nullptr}; + // Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase) + int perf_level_{0}; + // Memory manager ProfMemoryManager memory_manager_; diff --git a/src/a2a3/platform/include/host/runtime_profiling_mode.h b/src/a2a3/platform/include/host/runtime_profiling_mode.h new file mode 100644 index 000000000..a3999a300 --- /dev/null +++ b/src/a2a3/platform/include/host/runtime_profiling_mode.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * Shared helper: set perf_level and legacy enable_profiling on a Runtime struct. + * + * Used by both onboard and sim pto_runtime_c_api.cpp implementations. + * Some runtime structs still carry a bool enable_profiling member alongside + * the newer int perf_level. This template detects the legacy member at + * compile time and keeps both in sync. 
+ */ + +#pragma once + +#include <type_traits> + +template <typename, typename = void> +struct HasEnableProfilingMember : std::false_type {}; + +template <typename R> +struct HasEnableProfilingMember<R, std::void_t<decltype(std::declval<R>().enable_profiling)>> : std::true_type {}; + +template <typename R> +static inline void set_runtime_profiling_mode(R *runtime, int enable_profiling) { + runtime->perf_level = enable_profiling; + if constexpr (HasEnableProfilingMember<R>::value) { + runtime->enable_profiling = (enable_profiling > 0); + } +} diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 41d2235c8..24cc86b04 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -461,7 +461,7 @@ int DeviceRunner::run( }); // Initialize performance profiling if enabled - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { rc = init_performance_profiling(runtime, num_aicore, device_id); if (rc != 0) { LOG_ERROR("init_performance_profiling failed: %d", rc); @@ -530,13 +530,13 @@ int DeviceRunner::run( { // Poll and collect performance data in a separate collector thread std::thread collector_thread; - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { collector_thread = create_thread([this, &runtime]() { poll_and_collect_performance_data(runtime.get_task_count()); }); } auto thread_guard = RAIIScopeGuard([&]() { - if (runtime.enable_profiling && collector_thread.joinable()) { + if (runtime.perf_level > 0 && collector_thread.joinable()) { collector_thread.join(); } }); @@ -557,13 +557,13 @@ int DeviceRunner::run( } // Signal collector that device execution is complete - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.signal_execution_complete(); } } // Stop memory management, drain remaining buffers, collect phase data, export - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.stop_memory_manager(); perf_collector_.drain_remaining_buffers(); 
perf_collector_.scan_remaining_perf_buffers(); @@ -822,6 +822,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i return rtFree(dev_ptr); }; + perf_collector_.set_perf_level(runtime.perf_level); return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb); } diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index ceab62b52..b945c5133 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -26,6 +26,7 @@ #include "common/unified_log.h" #include "device_runner.h" // NOLINT(build/include_subdir) #include "runtime.h" // NOLINT(build/include_subdir) +#include "host/runtime_profiling_mode.h" extern "C" { @@ -162,9 +163,7 @@ int run_runtime( } // Phase 2: profiling - if (enable_profiling) { - r->enable_profiling = true; - } + set_runtime_profiling_mode(r, enable_profiling); // Phase 3: launch std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index e60ceed58..3e38a1f4e 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -299,7 +299,7 @@ int DeviceRunner::run( last_runtime_ = &runtime; // Initialize performance profiling if enabled - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { rc = init_performance_profiling(runtime, num_aicore, device_id); if (rc != 0) { LOG_ERROR("init_performance_profiling failed: %d", rc); @@ -388,7 +388,7 @@ int DeviceRunner::run( // Poll and collect performance data during execution (if enabled) std::thread collector_thread; - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { collector_thread = create_thread([this, &runtime]() { poll_and_collect_performance_data(runtime.get_task_count()); }); @@ -404,19 +404,19 @@ int 
DeviceRunner::run( } // Signal collector that device execution is complete - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.signal_execution_complete(); } // Wait for collector thread if it was launched - if (runtime.enable_profiling && collector_thread.joinable()) { + if (runtime.perf_level > 0 && collector_thread.joinable()) { collector_thread.join(); } LOG_INFO("All threads completed"); // Stop memory management, drain remaining buffers, collect phase data, export - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.stop_memory_manager(); perf_collector_.drain_remaining_buffers(); perf_collector_.scan_remaining_perf_buffers(); @@ -623,6 +623,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) { // ============================================================================= int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, int device_id) { + perf_collector_.set_perf_level(runtime.perf_level); // Define allocation callback (a2a3sim: use malloc) auto alloc_cb = [](size_t size) -> void * { return malloc(size); diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 37028f27d..6d85847b8 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -28,6 +28,7 @@ #include "cpu_sim_context.h" // NOLINT(build/include_subdir) #include "device_runner.h" // NOLINT(build/include_subdir) #include "runtime.h" // NOLINT(build/include_subdir) +#include "host/runtime_profiling_mode.h" extern "C" { @@ -154,9 +155,7 @@ int run_runtime( } // Phase 2: profiling - if (enable_profiling) { - r->enable_profiling = true; - } + set_runtime_profiling_mode(r, enable_profiling); // Phase 3: launch std::vector aicpu_vec; diff --git a/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp index 
57098e111..2a5c26619 100644 --- a/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp +++ b/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp @@ -219,7 +219,7 @@ void perf_aicpu_switch_buffer(Runtime *runtime, int core_id, int thread_idx) { } void perf_aicpu_flush_buffers(Runtime *runtime, int thread_idx, const int *cur_thread_cores, int core_num) { - if (!runtime->enable_profiling) { + if (runtime->perf_level <= 0) { return; } diff --git a/src/a2a3/platform/src/host/performance_collector.cpp b/src/a2a3/platform/src/host/performance_collector.cpp index 79a3bd78b..7ee695437 100644 --- a/src/a2a3/platform/src/host/performance_collector.cpp +++ b/src/a2a3/platform/src/host/performance_collector.cpp @@ -77,7 +77,7 @@ void ProfMemoryManager::stop() { // Drain remaining done_queue and free buffers { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo info = done_queue_.front(); done_queue_.pop(); @@ -99,7 +99,7 @@ void ProfMemoryManager::stop() { } bool ProfMemoryManager::try_pop_ready(ReadyBufferInfo &info) { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); if (ready_queue_.empty()) { return false; } @@ -121,7 +121,7 @@ bool ProfMemoryManager::wait_pop_ready(ReadyBufferInfo &info, std::chrono::milli } void ProfMemoryManager::notify_copy_done(const CopyDoneInfo &info) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); done_queue_.push(info); } @@ -210,7 +210,7 @@ void ProfMemoryManager::process_ready_entry( host_ptr = resolve_host_ptr(new_dev_ptr); } if (new_dev_ptr == nullptr) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo dinfo = done_queue_.front(); done_queue_.pop(); @@ -258,7 +258,7 @@ void ProfMemoryManager::process_ready_entry( info.buffer_seq = seq; { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); ready_queue_.push(info); } 
ready_cv_.notify_one(); @@ -289,7 +289,7 @@ void ProfMemoryManager::process_ready_entry( host_ptr = resolve_host_ptr(new_dev_ptr); } if (new_dev_ptr == nullptr) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo dinfo = done_queue_.front(); done_queue_.pop(); @@ -335,7 +335,7 @@ void ProfMemoryManager::process_ready_entry( info.buffer_seq = seq; { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); ready_queue_.push(info); } ready_cv_.notify_one(); @@ -348,7 +348,7 @@ void ProfMemoryManager::mgmt_loop() { while (running_.load()) { // 1. Recycle done queue: move completed buffers to recycled pools for reuse { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo info = done_queue_.front(); done_queue_.pop(); @@ -561,8 +561,9 @@ int PerformanceCollector::initialize( free_cb_ = free_cb; // Step 1: Calculate shared memory size (slot arrays only, no actual buffers) - int num_phase_threads = PLATFORM_MAX_AICPU_THREADS; - size_t total_size = calc_perf_data_size_with_phases(num_aicore, num_phase_threads); + int num_phase_threads = (perf_level_ >= 3) ? PLATFORM_MAX_AICPU_THREADS : 0; + size_t total_size = (num_phase_threads > 0) ? 
calc_perf_data_size_with_phases(num_aicore, num_phase_threads) : + calc_perf_data_size(num_aicore); LOG_DEBUG("Shared memory allocation plan:"); LOG_DEBUG(" Number of cores: %d", num_aicore); @@ -651,41 +652,43 @@ int PerformanceCollector::initialize( num_aicore * (PLATFORM_PROF_BUFFERS_PER_CORE - 1) ); - // Step 6: Initialize PhaseBufferStates — 1 buffer per thread in free_queue, rest to recycled pool - for (int t = 0; t < num_phase_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); - memset(state, 0, sizeof(PhaseBufferState)); - - state->free_queue.head = 0; - state->free_queue.tail = 0; - state->current_buf_ptr = 0; - state->current_buf_seq = 0; - - for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { - void *host_buf_ptr = nullptr; - void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr); - if (dev_buf_ptr == nullptr) { - LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s); - return -1; - } - PhaseBuffer *buf = reinterpret_cast(host_buf_ptr); - memset(buf, 0, sizeof(PhaseBuffer)); - buf->count = 0; + // Step 6: Initialize PhaseBufferStates (only when phase recording enabled) + if (num_phase_threads > 0) { + for (int t = 0; t < num_phase_threads; t++) { + PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); + memset(state, 0, sizeof(PhaseBufferState)); + + state->free_queue.head = 0; + state->free_queue.tail = 0; + state->current_buf_ptr = 0; + state->current_buf_seq = 0; + + for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { + void *host_buf_ptr = nullptr; + void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr); + if (dev_buf_ptr == nullptr) { + LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s); + return -1; + } + PhaseBuffer *buf = reinterpret_cast(host_buf_ptr); + memset(buf, 0, sizeof(PhaseBuffer)); + buf->count = 0; - if (s == 0) { - state->free_queue.buffer_ptrs[0] = 
reinterpret_cast(dev_buf_ptr); - } else { - memory_manager_.recycled_phase_buffers_.push_back(dev_buf_ptr); + if (s == 0) { + state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); + } else { + memory_manager_.recycled_phase_buffers_.push_back(dev_buf_ptr); + } } + wmb(); + state->free_queue.tail = 1; + wmb(); } - wmb(); - state->free_queue.tail = 1; - wmb(); + LOG_DEBUG( + "Initialized %d PhaseBufferStates: 1 buffer/thread, %d in recycled pool", num_phase_threads, + num_phase_threads * (PLATFORM_PROF_BUFFERS_PER_THREAD - 1) + ); } - LOG_DEBUG( - "Initialized %d PhaseBufferStates: 1 buffer/thread, %d in recycled pool", num_phase_threads, - num_phase_threads * (PLATFORM_PROF_BUFFERS_PER_THREAD - 1) - ); wmb(); @@ -706,8 +709,8 @@ void PerformanceCollector::start_memory_manager(const ThreadFactory &thread_fact } memory_manager_.start( - perf_shared_mem_host_, num_aicore_, PLATFORM_MAX_AICPU_THREADS, alloc_cb_, register_cb_, free_cb_, device_id_, - thread_factory + perf_shared_mem_host_, num_aicore_, (perf_level_ >= 3) ? PLATFORM_MAX_AICPU_THREADS : 0, alloc_cb_, + register_cb_, free_cb_, device_id_, thread_factory ); } @@ -1245,7 +1248,17 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { } // Step 7: Write JSON data - int version = has_phase_data_ ? 2 : 1; + int version; + if (perf_level_ <= 1) { + version = 0; + } else if (has_phase_data_) { + version = 2; + } else { + if (perf_level_ >= 3) { + LOG_WARN("perf_level=%d but no phase data collected; writing version=1", perf_level_); + } + version = 1; + } outfile << "{\n"; outfile << " \"version\": " << version << ",\n"; outfile << " \"tasks\": [\n"; @@ -1258,8 +1271,6 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { double start_us = cycles_to_us(record.start_time - base_time_cycles); double end_us = cycles_to_us(record.end_time - base_time_cycles); double duration_us = end_us - start_us; - double dispatch_us = (record.dispatch_time > 0) ? 
cycles_to_us(record.dispatch_time - base_time_cycles) : 0.0; - double finish_us = (record.finish_time > 0) ? cycles_to_us(record.finish_time - base_time_cycles) : 0.0; const char *core_type_str = (record.core_type == CoreType::AIC) ? "aic" : "aiv"; @@ -1271,20 +1282,27 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { outfile << " \"ring_id\": " << static_cast(record.task_id >> 32) << ",\n"; outfile << " \"start_time_us\": " << std::fixed << std::setprecision(3) << start_us << ",\n"; outfile << " \"end_time_us\": " << std::fixed << std::setprecision(3) << end_us << ",\n"; - outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << ",\n"; - outfile << " \"dispatch_time_us\": " << std::fixed << std::setprecision(3) << dispatch_us << ",\n"; - outfile << " \"finish_time_us\": " << std::fixed << std::setprecision(3) << finish_us << ",\n"; - outfile << " \"fanout\": ["; - int safe_fanout_count = - (record.fanout_count >= 0 && record.fanout_count <= RUNTIME_MAX_FANOUT) ? record.fanout_count : 0; - for (int j = 0; j < safe_fanout_count; ++j) { - outfile << record.fanout[j]; - if (j < safe_fanout_count - 1) { - outfile << ", "; + if (perf_level_ >= 2) { + double dispatch_us = + (record.dispatch_time > 0) ? cycles_to_us(record.dispatch_time - base_time_cycles) : 0.0; + double finish_us = (record.finish_time > 0) ? cycles_to_us(record.finish_time - base_time_cycles) : 0.0; + outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << ",\n"; + outfile << " \"dispatch_time_us\": " << std::fixed << std::setprecision(3) << dispatch_us << ",\n"; + outfile << " \"finish_time_us\": " << std::fixed << std::setprecision(3) << finish_us << ",\n"; + outfile << " \"fanout\": ["; + int safe_fanout_count = + (record.fanout_count >= 0 && record.fanout_count <= RUNTIME_MAX_FANOUT) ? 
record.fanout_count : 0; + for (int j = 0; j < safe_fanout_count; ++j) { + outfile << record.fanout[j]; + if (j < safe_fanout_count - 1) { + outfile << ", "; + } } + outfile << "],\n"; + outfile << " \"fanout_count\": " << record.fanout_count << "\n"; + } else { + outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << "\n"; } - outfile << "],\n"; - outfile << " \"fanout_count\": " << record.fanout_count << "\n"; outfile << " }"; if (i < tagged_records.size() - 1) { outfile << ","; diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h index eb01973dd..6ad207d76 100644 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h @@ -168,6 +168,7 @@ class Runtime { // Profiling support bool enable_profiling; // Enable profiling flag + int perf_level = 0; // Derived from enable_profiling: 0=off, 2=task+fanout // Orchestrator-to-scheduler transition control // When true, orchestrator threads convert to scheduler threads after orchestration completes. 
diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp index 9899b1a48..852c6c297 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp @@ -46,6 +46,7 @@ Runtime::Runtime() { worker_count = 0; sche_cpu_num = 1; enable_profiling = false; + perf_level = 0; perf_data_base = 0; tensor_pair_count = 0; diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 0c92ed234..9d7432a11 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -202,7 +202,8 @@ class Runtime { int sche_cpu_num; // Number of AICPU threads for scheduling // Profiling support - bool enable_profiling; // Enable profiling flag + bool enable_profiling; // Legacy flag for host_build_graph runtime paths + int perf_level; // 0=off, >0=profiling on (extended mode encoding) uint64_t perf_data_base; // Performance data shared memory base address (device-side) // Task storage diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index cb2977227..10ab0bf82 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -88,7 +88,7 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // Cache per-core dispatch payload pointer (set by AICPU before aicpu_ready) __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task); - bool profiling_enabled = runtime->enable_profiling; + bool profiling_enabled = (runtime->perf_level > 0); // Phase 4: Main execution loop - poll register for tasks until exit signal // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, 
AICORE_EXIT_SIGNAL=exit diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 7b8b60538..9d808bb1d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -456,7 +456,7 @@ struct AicpuExecutor { PTO2LocalReadyBuffer *local_bufs, CoreType ct #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_complete_count, uint64_t dispatch_ts + bool task_recording_enabled, int perf_level, uint32_t &phase_complete_count, uint64_t dispatch_ts #endif #if PTO2_SCHED_PROFILING , @@ -506,26 +506,31 @@ struct AicpuExecutor { } #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { #if PTO2_SCHED_PROFILING uint64_t t_perf_start = get_sys_cnt_aicpu(); #endif Handshake *h = &hank[core_id]; - uint64_t finish_ts = get_sys_cnt_aicpu(); PerfBuffer *perf_buf = reinterpret_cast(h->perf_records_addr); uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; int32_t fanout_n = 0; - PTO2DepListEntry *cur = slot_state.fanout_head; - while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { - fanout_arr[fanout_n++] = cur->slot_state->task->task_id.raw; - cur = cur->next; + uint64_t finish_ts = 0; + + if (perf_level >= 2) { + finish_ts = get_sys_cnt_aicpu(); + PTO2DepListEntry *cur = slot_state.fanout_head; + while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { + fanout_arr[fanout_n++] = cur->slot_state->task->task_id.raw; + cur = cur->next; + } } int32_t perf_slot_idx = static_cast(subslot); if (perf_aicpu_complete_record( perf_buf, static_cast(expected_reg_task_id), slot_state.task->task_id.raw, - slot_state.task->kernel_id[perf_slot_idx], ct, dispatch_ts, finish_ts, fanout_arr, fanout_n + slot_state.task->kernel_id[perf_slot_idx], ct, dispatch_ts, finish_ts, + (perf_level >= 2) ? 
fanout_arr : nullptr, fanout_n ) != 0) { DEV_ERROR( "Core %d: perf_aicpu_complete_record failed for task 0x%" PRIx64, core_id, @@ -564,7 +569,7 @@ struct AicpuExecutor { PTO2LocalReadyBuffer *local_bufs #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_complete_count + bool task_recording_enabled, int perf_level, uint32_t &phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -586,7 +591,7 @@ struct AicpuExecutor { int32_t reg_state = EXTRACT_TASK_STATE(reg_val); #if PTO2_SCHED_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { complete_probe_count++; } #endif @@ -596,7 +601,7 @@ struct AicpuExecutor { if (!t.matched) continue; #if PTO2_SCHED_PROFILING - if (profiling_enabled && (t.running_done || t.pending_done)) { + if (task_recording_enabled && (t.running_done || t.pending_done)) { complete_hit_count++; } #endif @@ -610,7 +615,7 @@ struct AicpuExecutor { completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs, CT #if PTO2_PROFILING , - profiling_enabled, phase_complete_count, core.pending_dispatch_timestamp + task_recording_enabled, perf_level, phase_complete_count, core.pending_dispatch_timestamp #endif #if PTO2_SCHED_PROFILING , @@ -626,7 +631,7 @@ struct AicpuExecutor { completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs, CT #if PTO2_PROFILING , - profiling_enabled, phase_complete_count, core.running_dispatch_timestamp + task_recording_enabled, perf_level, phase_complete_count, core.running_dispatch_timestamp #endif #if PTO2_SCHED_PROFILING , @@ -773,7 +778,7 @@ struct AicpuExecutor { PTO2SubtaskSlot subslot, bool to_pending #if PTO2_PROFILING , - bool profiling_enabled + bool task_recording_enabled #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -813,7 +818,7 @@ struct AicpuExecutor { core_exec_state.pending_slot_state = &slot_state; core_exec_state.pending_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (profiling_enabled) { + 
if (task_recording_enabled && runtime->perf_level >= 2) { core_exec_state.pending_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -822,7 +827,7 @@ struct AicpuExecutor { core_exec_state.running_slot_state = &slot_state; core_exec_state.running_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled && runtime->perf_level >= 2) { core_exec_state.running_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -830,7 +835,7 @@ struct AicpuExecutor { tracker.change_core_state(core_offset); } #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { if (core_exec_state.dispatch_count >= PLATFORM_PROF_BUFFER_SIZE) { perf_aicpu_switch_buffer(runtime, core_id, thread_idx); core_exec_state.dispatch_count = 0; @@ -852,7 +857,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state #if PTO2_PROFILING , - bool profiling_enabled + bool task_recording_enabled #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -863,7 +868,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } @@ -873,7 +878,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } @@ -883,7 +888,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } @@ -923,7 +928,7 @@ struct AicpuExecutor { PTO2ResourceShape shape #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -932,7 +937,7 @@ struct AicpuExecutor { runtime, thread_idx, cluster_offset, slot_state #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } else if (shape == PTO2ResourceShape::AIC) { @@ -941,7 +946,7 @@ struct AicpuExecutor 
{ false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } else { // AIV @@ -952,12 +957,14 @@ struct AicpuExecutor { runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } #if PTO2_PROFILING - phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); + if (phase_recording_enabled) { + phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); + } #endif } @@ -981,7 +988,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t block_num #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { PTO2TaskSlotState *slot_state = drain_state_.pending_task; @@ -998,7 +1005,7 @@ struct AicpuExecutor { runtime, t, valid.pop_first(), *slot_state, shape #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); slot_state->next_block_idx++; @@ -1032,7 +1039,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t thread_idx #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { // Spin until drain is fully initialized (sentinel -1 → block_num > 0). 
@@ -1081,7 +1088,7 @@ struct AicpuExecutor { runtime, block_num #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); } @@ -1474,9 +1481,10 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #if PTO2_PROFILING // Assign perf buffers to cores early so profiling captures all tasks // (total_tasks written to header later when orchestrator completes) - if (runtime->enable_profiling) { + if (runtime->perf_level > 0) { perf_aicpu_init_profiling(runtime); - // Initialize phase profiling for scheduler threads + orchestrator threads + } + if (runtime->perf_level >= 3) { perf_aicpu_init_phase_profiling(runtime, sched_thread_num_); perf_aicpu_set_orch_thread_idx(sched_thread_num_); } @@ -1495,7 +1503,9 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa int32_t idle_iterations = 0; int32_t last_progress_count = 0; #if PTO2_PROFILING - bool profiling_enabled = runtime->enable_profiling; + int perf_level = runtime->perf_level; + bool task_recording_enabled = (perf_level > 0); + bool phase_recording_enabled = (perf_level >= 3); #endif // Scheduler profiling counters @@ -1618,7 +1628,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa deferred_release_slot_states, deferred_release_count, local_bufs #if PTO2_PROFILING , - profiling_enabled, phase_complete_count + task_recording_enabled, perf_level, phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -1636,7 +1646,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa deferred_release_slot_states, deferred_release_count, local_bufs #if PTO2_PROFILING , - profiling_enabled, phase_complete_count + task_recording_enabled, perf_level, phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -1668,7 +1678,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa 
CYCLE_COUNT_LAP(sched_idle_cycle); } else { CYCLE_COUNT_LAP(sched_complete_cycle); - if (profiling_enabled && phase_complete_count > 0) { + if (phase_recording_enabled && phase_complete_count > 0) { perf_aicpu_record_phase( thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, sched_loop_count, phase_complete_count ); @@ -1689,7 +1699,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, thread_idx #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); continue; @@ -1773,7 +1783,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, thread_idx, current_valid_cluster_offset, *slot_state, shape #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); slot_state->next_block_idx++; @@ -1841,7 +1851,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa PTO2SubtaskSlot::AIC, true #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); slot_state->next_block_idx++; @@ -1879,7 +1889,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa CYCLE_COUNT_LAP(sched_idle_cycle); } else { CYCLE_COUNT_LAP(sched_dispatch_cycle); - if (profiling_enabled && phase_dispatch_count > 0) { + if (phase_recording_enabled && phase_dispatch_count > 0) { perf_aicpu_record_phase( thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, sched_loop_count, phase_dispatch_count ); @@ -2039,7 +2049,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa } #if PTO2_PROFILING CYCLE_COUNT_LAP(sched_idle_cycle); - if (profiling_enabled) { + if (phase_recording_enabled) { perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, sched_loop_count, 0); _t0_phase = _t1; } @@ -2207,9 +2217,10 @@ int32_t 
AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #endif #if PTO2_PROFILING - // Flush performance buffers for cores managed by this thread - if (profiling_enabled) { + if (task_recording_enabled) { perf_aicpu_flush_buffers(runtime, thread_idx, core_assignments_[thread_idx], core_num); + } + if (phase_recording_enabled) { perf_aicpu_flush_phase_buffers(thread_idx); } #endif @@ -2388,7 +2399,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #if PTO2_PROFILING - rt->orchestrator.enable_profiling = runtime->enable_profiling; + rt->orchestrator.perf_level = runtime->perf_level; #endif // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). @@ -2412,7 +2423,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #if PTO2_PROFILING - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { perf_aicpu_set_orch_thread_idx(thread_idx); } #endif @@ -2496,7 +2507,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING // Write orchestrator summary to shared memory for host-side export (only if profiling enabled) - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { AicpuOrchSummary orch_summary = {}; orch_summary.start_time = orch_cycle_start; orch_summary.end_time = orch_cycle_end; @@ -2516,7 +2527,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING // Write core-to-thread mapping (one-time, after orchestration) - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { perf_aicpu_write_core_assignments( core_assignments_, core_count_per_thread_, sched_thread_num_, cores_total_num_ ); @@ -2540,7 +2551,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { pto2_submitted_tasks = pto2_task_count; #endif total_tasks_ = pto2_task_count; - if (runtime->enable_profiling && pto2_task_count > 0) { + if (runtime->perf_level > 0 && pto2_task_count > 0) { perf_aicpu_update_total_tasks(runtime, static_cast(pto2_task_count)); } int32_t inline_completed = 
static_cast(rt->orchestrator.inline_completed_tasks); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index e201f3d30..eed777d19 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -94,8 +94,8 @@ __attribute__((weak, visibility("hidden"))) void perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} // submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level) static uint32_t g_orch_submit_idx = 0; -#define CYCLE_COUNT_START() \ - bool _prof_active = orch->enable_profiling; \ +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->perf_level >= 3); \ uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0 #define CYCLE_COUNT_LAP(acc) \ do { \ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 9db96eaa1..8bf27e791 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -75,8 +75,7 @@ struct PTO2OrchestratorState { int32_t total_cluster_count{0}; // AIC cores = MIX clusters int32_t total_aiv_count{0}; // AIV cores (= 2 × clusters on standard hardware) #if PTO2_PROFILING - // Runtime profiling switch copied from Runtime::enable_profiling. 
- bool enable_profiling; + int perf_level; #endif // === GM HEAP (for output buffers) === diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 18a069ab2..7b4308134 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -170,7 +170,7 @@ class Runtime { uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; // Profiling support - bool enable_profiling; // Enable profiling flag + int perf_level = 0; // 0=off, 1=AICore-only, 2=task+fanout, 3=full // Orchestrator-to-scheduler transition control // When true, orchestrator threads convert to scheduler threads after orchestration completes. diff --git a/src/common/distributed/dist_chip_process.cpp b/src/common/distributed/dist_chip_process.cpp index dd8f7621a..dece07f5f 100644 --- a/src/common/distributed/dist_chip_process.cpp +++ b/src/common/distributed/dist_chip_process.cpp @@ -57,7 +57,7 @@ void DistChipProcess::run(const WorkerPayload &payload) { // Write config fields int32_t block_dim = payload.block_dim; int32_t aicpu_tn = payload.aicpu_thread_num; - int32_t profiling = payload.enable_profiling ? 
1 : 0; + int32_t profiling = payload.enable_profiling; std::memcpy(base() + OFF_BLOCK_DIM, &block_dim, sizeof(int32_t)); std::memcpy(base() + OFF_AICPU_THREAD_NUM, &aicpu_tn, sizeof(int32_t)); std::memcpy(base() + OFF_ENABLE_PROFILING, &profiling, sizeof(int32_t)); diff --git a/src/common/distributed/dist_types.h b/src/common/distributed/dist_types.h index b3a9695ee..a3bc802b7 100644 --- a/src/common/distributed/dist_types.h +++ b/src/common/distributed/dist_types.h @@ -81,7 +81,7 @@ struct WorkerPayload { const void *args = nullptr; // ChipStorageTaskArgs* int32_t block_dim = 1; int32_t aicpu_thread_num = 3; - bool enable_profiling = false; + int enable_profiling = 0; // --- SubWorker fields --- int32_t callable_id = -1; diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 2ef0a9b00..1ade6d818 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -191,7 +191,7 @@ void ChipWorker::run(const WorkerPayload &payload) { ChipCallConfig config; config.block_dim = payload.block_dim; config.aicpu_thread_num = payload.aicpu_thread_num; - config.enable_profiling = payload.enable_profiling; + config.perf_level = payload.enable_profiling; run(payload.callable, payload.args, config); } @@ -204,7 +204,7 @@ void ChipWorker::run(const void *callable, const void *args, const ChipCallConfi int rc = run_runtime_fn_( device_ctx_, rt, callable, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(), - aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_profiling ? 
1 : 0 + aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.perf_level ); if (rc != 0) { throw std::runtime_error("run_runtime failed with code " + std::to_string(rc)); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index b6fd7c4b5..cf614ae22 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -21,7 +21,7 @@ struct ChipCallConfig { int block_dim = 24; int aicpu_thread_num = 3; - bool enable_profiling = false; + int perf_level = 0; // 0=off, 1=AICore-only, 2=task+fanout, 3=full }; class ChipWorker : public IWorker { diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 59b85f985..8a3947e3b 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -31,7 +31,7 @@ def test_defaults(self): config = ChipCallConfig() assert config.block_dim == 24 assert config.aicpu_thread_num == 3 - assert config.enable_profiling is False + assert config.enable_profiling == 0 def test_setters(self): config = ChipCallConfig() @@ -40,13 +40,13 @@ def test_setters(self): config.enable_profiling = True assert config.block_dim == 32 assert config.aicpu_thread_num == 4 - assert config.enable_profiling is True + assert config.enable_profiling > 0 def test_repr(self): config = ChipCallConfig() r = repr(config) assert "block_dim=24" in r - assert "enable_profiling=False" in r + assert "enable_profiling=0" in r # ============================================================================ diff --git a/tools/swimlane_converter.py b/tools/swimlane_converter.py index f47ae9b47..f7cb2670b 100644 --- a/tools/swimlane_converter.py +++ b/tools/swimlane_converter.py @@ -120,8 +120,8 @@ def read_perf_data(filepath): raise ValueError(f"Missing required field: {field}") # Validate version - if data["version"] not in [1, 2]: - raise ValueError(f"Unsupported version: {data['version']} (expected 1 or 2)") + if data["version"] not in [0, 1, 2]: + raise 
ValueError(f"Unsupported version: {data['version']} (expected 0/1/2)") return data @@ -495,7 +495,8 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 dur = task["duration_us"] # Build fanout hint string (packed ids → rXtY / tY for readability) - fanout_str = "[" + ", ".join(format_task_display(x) for x in task["fanout"]) + "]" + fanout_list = task.get("fanout", []) + fanout_str = "[" + ", ".join(format_task_display(x) for x in fanout_list) + "]" # Get function name if available func_id = task["func_id"] @@ -579,7 +580,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 src_tid = core_to_tid[task["core_id"]] src_ts_end = task["end_time_us"] - for succ_task_id in task["fanout"]: + for succ_task_id in task.get("fanout", []): if succ_task_id not in task_map: if verbose: print( @@ -810,7 +811,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 src_tid = core_to_tid[task["core_id"]] - for succ_task_id in task["fanout"]: + for succ_task_id in task.get("fanout", []): if succ_task_id not in task_map: continue