Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions examples/scripts/code_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,12 @@ def _get_project_root() -> Path:
return Path(__file__).parent.parent.parent # examples/scripts/ -> examples/ -> simpler/


def _normalize_perf_level(v) -> int:
if isinstance(v, bool):
return 3 if v else 0
return int(v)


def _get_pto_isa_clone_path() -> Path:
    """Return the location where the pto-isa clone is expected.

    The path is ``<project root>/examples/scripts/_deps/pto-isa``; nothing is
    created or checked on disk here.
    """
    project_root = _get_project_root()
    return project_root.joinpath("examples", "scripts", "_deps", "pto-isa")
Expand Down Expand Up @@ -477,7 +483,7 @@ def __init__( # noqa: PLR0913
golden_path: str,
device_id: Optional[int] = None,
platform: str = "a2a3",
enable_profiling: bool = False,
enable_profiling: int = 0,
run_all_cases: bool = False,
case_name: Optional[str] = None,
pto_isa_commit: Optional[str] = None,
Expand All @@ -492,7 +498,7 @@ def __init__( # noqa: PLR0913
self.kernels_dir = Path(kernels_dir).resolve()
self.golden_path = Path(golden_path).resolve()
self.platform = platform
self.enable_profiling = enable_profiling
self._perf_level = _normalize_perf_level(enable_profiling)
self.skip_golden = skip_golden
self.project_root = _get_project_root()

Expand Down Expand Up @@ -887,9 +893,9 @@ def _compile_one_kernel(kernel):
config = CallConfig()
config.block_dim = self.block_dim
config.aicpu_thread_num = self.aicpu_thread_num
if self.enable_profiling and round_idx == 0:
config.enable_profiling = True
logger.info("Profiling enabled")
if self._perf_level > 0 and round_idx == 0:
config.enable_profiling = self._perf_level
logger.info(f"Swimlane profiling enabled (mode={self._perf_level})")

with _temporary_env(run_env):
worker.run(chip_callable, orch_args, config)
Expand Down
8 changes: 6 additions & 2 deletions examples/scripts/run_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,12 @@ def compute_golden(tensors: dict, params: dict) -> None:

parser.add_argument(
"--enable-profiling",
action="store_true",
help="Enable profiling and generate swimlane.json",
type=int,
nargs="?",
const=3,
default=0,
metavar="LEVEL",
help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)",
)

parser.add_argument(
Expand Down
22 changes: 17 additions & 5 deletions python/bindings/task_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -571,11 +571,23 @@ NB_MODULE(_task_interface, m) {
.def(nb::init<>())
.def_rw("block_dim", &CallConfig::block_dim)
.def_rw("aicpu_thread_num", &CallConfig::aicpu_thread_num)
.def_rw("enable_profiling", &CallConfig::enable_profiling)
.def_prop_rw(
"enable_profiling",
[](const CallConfig &self) {
return self.perf_level;
},
[](CallConfig &self, nb::object v) {
if (nb::isinstance<nb::bool_>(v)) {
self.perf_level = nb::cast<bool>(v) ? 3 : 0;
} else {
self.perf_level = nb::cast<int>(v);
}
}
)
.def("__repr__", [](const CallConfig &self) -> std::string {
std::ostringstream os;
os << "CallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num
<< ", enable_profiling=" << (self.enable_profiling ? "True" : "False") << ")";
<< ", enable_profiling=" << self.perf_level << ")";
return os.str();
});

Expand All @@ -599,15 +611,15 @@ NB_MODULE(_task_interface, m) {
.def(
"run_raw",
[](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num,
bool enable_profiling) {
int perf_level) {
CallConfig config;
config.block_dim = block_dim;
config.aicpu_thread_num = aicpu_thread_num;
config.enable_profiling = enable_profiling;
config.perf_level = perf_level;
self.run(reinterpret_cast<const void *>(callable), reinterpret_cast<const void *>(args), config);
},
nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3,
nb::arg("enable_profiling") = false, "Run with raw pointer arguments (used from forked chip process)."
nb::arg("perf_level") = 0, "Run with raw pointer arguments (used from forked chip process)."
)
.def_prop_ro("device_id", &ChipWorker::device_id)
.def_prop_ro("initialized", &ChipWorker::initialized)
Expand Down
2 changes: 1 addition & 1 deletion python/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def _chip_process_loop(

error = 0
try:
cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, bool(profiling))
cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, profiling)
except Exception: # noqa: BLE001
error = 1
struct.pack_into("i", buf, _CHIP_OFF_ERROR, error)
Expand Down
9 changes: 9 additions & 0 deletions src/a2a3/platform/include/host/performance_collector.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,12 @@ class PerformanceCollector {
*/
bool is_initialized() const { return perf_shared_mem_host_ != nullptr; }

/**
 * Store the profiling level; meant to be called before initialize().
 *
 * Levels: 0 = off, 1 = AICore-only, 2 = task + fanout,
 * 3 = full (with phase buffers).
 *
 * @param level Value written to perf_level_ as-is (no range check here).
 */
void set_perf_level(int level) {
    perf_level_ = level;
}

/**
* Drain remaining buffers from the memory manager's ready queue
*
Expand Down Expand Up @@ -387,6 +393,9 @@ class PerformanceCollector {
PerfRegisterCallback register_cb_{nullptr};
PerfFreeCallback free_cb_{nullptr};

// Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase)
int perf_level_{0};

// Memory manager
ProfMemoryManager memory_manager_;

Expand Down
37 changes: 37 additions & 0 deletions src/a2a3/platform/include/host/runtime_profiling_mode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) PyPTO Contributors.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
* -----------------------------------------------------------------------------------------------------------
*/

/**
* Shared helper: set perf_level and legacy enable_profiling on a Runtime struct.
*
* Used by both onboard and sim pto_runtime_c_api.cpp implementations.
* Some runtime structs still carry a bool enable_profiling member alongside
* the newer int perf_level. This template detects the legacy member at
* compile time and keeps both in sync.
*/

#pragma once

#include <type_traits>

/**
 * Compile-time probe for a data member named `enable_profiling`.
 *
 * Primary template: selected when the member access expression is
 * ill-formed for T, so `value` is false.
 */
template <typename T, typename Enable = void>
struct HasEnableProfilingMember : std::bool_constant<false> {};

/**
 * Specialization chosen when `std::declval<T &>().enable_profiling` is a
 * valid expression, so `value` is true.
 */
template <typename T>
struct HasEnableProfilingMember<T, std::void_t<decltype(std::declval<T &>().enable_profiling)>>
    : std::bool_constant<true> {};

/**
 * Write the requested profiling level into a runtime object.
 *
 * `perf_level` always receives the raw level. If R also has an
 * `enable_profiling` member, it is set to whether the level is positive,
 * so both fields agree.
 *
 * @param runtime           Object to update; dereferenced without a null check.
 * @param enable_profiling  Profiling level (values <= 0 clear the legacy flag).
 */
template <typename R>
static inline void set_runtime_profiling_mode(R *runtime, int enable_profiling) {
    constexpr bool kHasLegacyFlag = HasEnableProfilingMember<R>::value;

    runtime->perf_level = enable_profiling;
    if constexpr (kHasLegacyFlag) {
        // Only instantiated for types that actually declare the legacy member.
        const bool profiling_on = enable_profiling > 0;
        runtime->enable_profiling = profiling_on;
    }
}
11 changes: 6 additions & 5 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ int DeviceRunner::run(
});

// Initialize performance profiling if enabled
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
rc = init_performance_profiling(runtime, num_aicore, device_id);
if (rc != 0) {
LOG_ERROR("init_performance_profiling failed: %d", rc);
Expand Down Expand Up @@ -518,13 +518,13 @@ int DeviceRunner::run(
{
// Poll and collect performance data in a separate collector thread
std::thread collector_thread;
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
collector_thread = create_thread([this, &runtime]() {
poll_and_collect_performance_data(runtime.get_task_count());
});
}
auto thread_guard = RAIIScopeGuard([&]() {
if (runtime.enable_profiling && collector_thread.joinable()) {
if (runtime.perf_level > 0 && collector_thread.joinable()) {
collector_thread.join();
}
});
Expand All @@ -545,13 +545,13 @@ int DeviceRunner::run(
}

// Signal collector that device execution is complete
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.signal_execution_complete();
}
}

// Stop memory management, drain remaining buffers, collect phase data, export
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.stop_memory_manager();
perf_collector_.drain_remaining_buffers();
perf_collector_.scan_remaining_perf_buffers();
Expand Down Expand Up @@ -810,6 +810,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i
return rtFree(dev_ptr);
};

perf_collector_.set_perf_level(runtime.perf_level);
return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb);
}

Expand Down
5 changes: 2 additions & 3 deletions src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "common/unified_log.h"
#include "device_runner.h" // NOLINT(build/include_subdir)
#include "runtime.h" // NOLINT(build/include_subdir)
#include "host/runtime_profiling_mode.h"

extern "C" {

Expand Down Expand Up @@ -157,9 +158,7 @@ int run_runtime(
}

// Phase 2: profiling
if (enable_profiling) {
r->enable_profiling = true;
}
set_runtime_profiling_mode(r, enable_profiling);

// Phase 3: launch
std::vector<uint8_t> aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size);
Expand Down
11 changes: 6 additions & 5 deletions src/a2a3/platform/sim/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ int DeviceRunner::run(
last_runtime_ = &runtime;

// Initialize performance profiling if enabled
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
rc = init_performance_profiling(runtime, num_aicore, device_id);
if (rc != 0) {
LOG_ERROR("init_performance_profiling failed: %d", rc);
Expand Down Expand Up @@ -397,7 +397,7 @@ int DeviceRunner::run(

// Poll and collect performance data during execution (if enabled)
std::thread collector_thread;
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
collector_thread = create_thread([this, &runtime]() {
poll_and_collect_performance_data(runtime.get_task_count());
});
Expand All @@ -413,19 +413,19 @@ int DeviceRunner::run(
}

// Signal collector that device execution is complete
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.signal_execution_complete();
}

// Wait for collector thread if it was launched
if (runtime.enable_profiling && collector_thread.joinable()) {
if (runtime.perf_level > 0 && collector_thread.joinable()) {
collector_thread.join();
}

LOG_INFO("All threads completed");

// Stop memory management, drain remaining buffers, collect phase data, export
if (runtime.enable_profiling) {
if (runtime.perf_level > 0) {
perf_collector_.stop_memory_manager();
perf_collector_.drain_remaining_buffers();
perf_collector_.scan_remaining_perf_buffers();
Expand Down Expand Up @@ -622,6 +622,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) {
// =============================================================================

int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, int device_id) {
perf_collector_.set_perf_level(runtime.perf_level);
// Define allocation callback (a2a3sim: use malloc)
auto alloc_cb = [](size_t size) -> void * {
return malloc(size);
Expand Down
5 changes: 2 additions & 3 deletions src/a2a3/platform/sim/host/pto_runtime_c_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "cpu_sim_context.h" // NOLINT(build/include_subdir)
#include "device_runner.h" // NOLINT(build/include_subdir)
#include "runtime.h" // NOLINT(build/include_subdir)
#include "host/runtime_profiling_mode.h"

extern "C" {

Expand Down Expand Up @@ -154,9 +155,7 @@ int run_runtime(
}

// Phase 2: profiling
if (enable_profiling) {
r->enable_profiling = true;
}
set_runtime_profiling_mode(r, enable_profiling);

// Phase 3: launch
std::vector<uint8_t> aicpu_vec;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ void perf_aicpu_switch_buffer(Runtime *runtime, int core_id, int thread_idx) {
}

void perf_aicpu_flush_buffers(Runtime *runtime, int thread_idx, const int *cur_thread_cores, int core_num) {
if (!runtime->enable_profiling) {
if (runtime->perf_level <= 0) {
return;
}

Expand Down
Loading
Loading