From 0b39cb0f25db4a9143455f316017dfa17b60001d Mon Sep 17 00:00:00 2001 From: wangzihao122 Date: Fri, 10 Apr 2026 02:46:58 +0800 Subject: [PATCH] Refactor: introduce tiered profiling levels for a2a3 tensormap_and_ringbuffer swimlane export Replace the boolean `enable_profiling` flag with a 4-level `perf_level` (0=off, 1=AICore-only, 2=task+fanout, 3=full with AICPU phase records). The tensormap_and_ringbuffer runtime honors all four levels, while legacy host_build_graph / aicpu_build_graph paths continue to treat any non-zero value as a simple on/off and stay on their existing bool member (synchronized via a shared SFINAE helper in runtime_profiling_mode.h). CLI and JSON are lifted to match: - `--enable-profiling` in run_example.py now takes an optional int (default 3 when flag given, 0 otherwise). - The swimlane JSON schema gains a new version=0 (level 1: AICore-only) that omits dispatch/finish/fanout fields, and swimlane_converter.py accepts it. - Phase buffer allocation, scheduler-phase recording and orchestrator summary writes in aicpu_executor.cpp are gated on perf_level>=3 so lower levels no longer pay the phase-profiling overhead; fanout/dispatch_timestamp collection is gated on perf_level>=2. Additionally: - CallConfig and WorkerPayload switch from bool to int; Python bindings accept both bool and int for backward compatibility (_normalize_perf_level in code_runner.py, getter/setter shim in task_interface.cpp). - PerformanceCollector skips phase-buffer shared-memory allocation and phase-thread management when perf_level < 3 (calc_perf_data_size path). - device_runner.cpp (onboard + sim): all enable_profiling guards replaced with perf_level > 0; set_perf_level() called before initialize(). - Unit tests updated for int-based profiling values. 
--- examples/scripts/code_runner.py | 16 ++- examples/scripts/run_example.py | 8 +- python/bindings/task_interface.cpp | 22 ++- python/simpler/worker.py | 2 +- .../include/host/performance_collector.h | 9 ++ .../include/host/runtime_profiling_mode.h | 37 +++++ .../platform/onboard/host/device_runner.cpp | 11 +- .../onboard/host/pto_runtime_c_api.cpp | 5 +- src/a2a3/platform/sim/host/device_runner.cpp | 11 +- .../platform/sim/host/pto_runtime_c_api.cpp | 5 +- .../src/aicpu/performance_collector_aicpu.cpp | 2 +- .../src/host/performance_collector.cpp | 134 ++++++++++-------- .../aicpu_build_graph/runtime/runtime.h | 1 + .../host_build_graph/runtime/runtime.cpp | 1 + .../host_build_graph/runtime/runtime.h | 3 +- .../aicore/aicore_executor.cpp | 2 +- .../aicpu/aicpu_executor.cpp | 107 +++++++------- .../runtime/pto_orchestrator.cpp | 4 +- .../runtime/pto_orchestrator.h | 3 +- .../runtime/runtime.h | 2 +- src/common/distributed/dist_chip_process.cpp | 2 +- src/common/distributed/dist_types.h | 2 +- src/common/worker/chip_worker.cpp | 4 +- src/common/worker/chip_worker.h | 2 +- tests/ut/py/test_chip_worker.py | 6 +- tools/swimlane_converter.py | 11 +- 26 files changed, 256 insertions(+), 156 deletions(-) create mode 100644 src/a2a3/platform/include/host/runtime_profiling_mode.h diff --git a/examples/scripts/code_runner.py b/examples/scripts/code_runner.py index 38bd3eb2a..358c087c9 100644 --- a/examples/scripts/code_runner.py +++ b/examples/scripts/code_runner.py @@ -133,6 +133,12 @@ def _get_project_root() -> Path: return Path(__file__).parent.parent.parent # examples/scripts/ -> examples/ -> simpler/ +def _normalize_perf_level(v) -> int: + if isinstance(v, bool): + return 3 if v else 0 + return int(v) + + def _get_pto_isa_clone_path() -> Path: """Get the expected path to pto-isa clone.""" return _get_project_root() / "examples" / "scripts" / "_deps" / "pto-isa" @@ -477,7 +483,7 @@ def __init__( # noqa: PLR0913 golden_path: str, device_id: Optional[int] = None, 
platform: str = "a2a3", - enable_profiling: bool = False, + enable_profiling: int = 0, run_all_cases: bool = False, case_name: Optional[str] = None, pto_isa_commit: Optional[str] = None, @@ -492,7 +498,7 @@ def __init__( # noqa: PLR0913 self.kernels_dir = Path(kernels_dir).resolve() self.golden_path = Path(golden_path).resolve() self.platform = platform - self.enable_profiling = enable_profiling + self._perf_level = _normalize_perf_level(enable_profiling) self.skip_golden = skip_golden self.project_root = _get_project_root() @@ -887,9 +893,9 @@ def _compile_one_kernel(kernel): config = ChipCallConfig() config.block_dim = self.block_dim config.aicpu_thread_num = self.aicpu_thread_num - if self.enable_profiling and round_idx == 0: - config.enable_profiling = True - logger.info("Profiling enabled") + if self._perf_level > 0 and round_idx == 0: + config.enable_profiling = self._perf_level + logger.info(f"Swimlane profiling enabled (mode={self._perf_level})") with _temporary_env(run_env): worker.run(chip_callable, orch_args, config) diff --git a/examples/scripts/run_example.py b/examples/scripts/run_example.py index 89ab84199..2a65dfeda 100644 --- a/examples/scripts/run_example.py +++ b/examples/scripts/run_example.py @@ -148,8 +148,12 @@ def compute_golden(tensors: dict, params: dict) -> None: parser.add_argument( "--enable-profiling", - action="store_true", - help="Enable profiling and generate swimlane.json", + type=int, + nargs="?", + const=3, + default=0, + metavar="LEVEL", + help="Swimlane profiling mode: 1=AICore only, 2=task+fanout, 3=full (default when flag given: 3)", ) parser.add_argument( diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 2e1962df0..87d252a2b 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -579,11 +579,23 @@ NB_MODULE(_task_interface, m) { .def(nb::init<>()) .def_rw("block_dim", &ChipCallConfig::block_dim) .def_rw("aicpu_thread_num", 
&ChipCallConfig::aicpu_thread_num) - .def_rw("enable_profiling", &ChipCallConfig::enable_profiling) + .def_prop_rw( + "enable_profiling", + [](const ChipCallConfig &self) { + return self.perf_level; + }, + [](ChipCallConfig &self, nb::object v) { + if (nb::isinstance(v)) { + self.perf_level = nb::cast(v) ? 3 : 0; + } else { + self.perf_level = nb::cast(v); + } + } + ) .def("__repr__", [](const ChipCallConfig &self) -> std::string { std::ostringstream os; os << "ChipCallConfig(block_dim=" << self.block_dim << ", aicpu_thread_num=" << self.aicpu_thread_num - << ", enable_profiling=" << (self.enable_profiling ? "True" : "False") << ")"; + << ", enable_profiling=" << self.perf_level << ")"; return os.str(); }); @@ -608,15 +620,15 @@ NB_MODULE(_task_interface, m) { .def( "run_raw", [](ChipWorker &self, uint64_t callable, uint64_t args, int block_dim, int aicpu_thread_num, - bool enable_profiling) { + int perf_level) { ChipCallConfig config; config.block_dim = block_dim; config.aicpu_thread_num = aicpu_thread_num; - config.enable_profiling = enable_profiling; + config.perf_level = perf_level; self.run(reinterpret_cast(callable), reinterpret_cast(args), config); }, nb::arg("callable"), nb::arg("args"), nb::arg("block_dim") = 1, nb::arg("aicpu_thread_num") = 3, - nb::arg("enable_profiling") = false, "Run with raw pointer arguments (used from forked chip process)." + nb::arg("perf_level") = 0, "Run with raw pointer arguments (used from forked chip process)." 
) .def_prop_ro("device_id", &ChipWorker::device_id) .def_prop_ro("initialized", &ChipWorker::initialized) diff --git a/python/simpler/worker.py b/python/simpler/worker.py index 3e8d2e9aa..5508f8c0b 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -174,7 +174,7 @@ def _chip_process_loop( error = 0 try: - cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, bool(profiling)) + cw.run_raw(callable_ptr, heap_args_ptr, block_dim, aicpu_tn, profiling) except Exception: # noqa: BLE001 error = 1 struct.pack_into("i", buf, _CHIP_OFF_ERROR, error) diff --git a/src/a2a3/platform/include/host/performance_collector.h b/src/a2a3/platform/include/host/performance_collector.h index cf6a52e2b..98644abbf 100644 --- a/src/a2a3/platform/include/host/performance_collector.h +++ b/src/a2a3/platform/include/host/performance_collector.h @@ -332,6 +332,12 @@ class PerformanceCollector { */ bool is_initialized() const { return perf_shared_mem_host_ != nullptr; } + /** + * Set profiling level before initialize(). + * 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase buffers) + */ + void set_perf_level(int level) { perf_level_ = level; } + /** * Drain remaining buffers from the memory manager's ready queue * @@ -387,6 +393,9 @@ class PerformanceCollector { PerfRegisterCallback register_cb_{nullptr}; PerfFreeCallback free_cb_{nullptr}; + // Profiling level: 0=off, 1=AICore-only, 2=task+fanout, 3=full (with phase) + int perf_level_{0}; + // Memory manager ProfMemoryManager memory_manager_; diff --git a/src/a2a3/platform/include/host/runtime_profiling_mode.h b/src/a2a3/platform/include/host/runtime_profiling_mode.h new file mode 100644 index 000000000..a3999a300 --- /dev/null +++ b/src/a2a3/platform/include/host/runtime_profiling_mode.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +/** + * Shared helper: set perf_level and legacy enable_profiling on a Runtime struct. + * + * Used by both onboard and sim pto_runtime_c_api.cpp implementations. + * Some runtime structs still carry a bool enable_profiling member alongside + * the newer int perf_level. This template detects the legacy member at + * compile time and keeps both in sync. 
+ */ + +#pragma once + +#include <type_traits> + +template <typename, typename = void> +struct HasEnableProfilingMember : std::false_type {}; + +template <typename R> +struct HasEnableProfilingMember<R, std::void_t<decltype(std::declval<R>().enable_profiling)>> : std::true_type {}; + +template <typename R> +static inline void set_runtime_profiling_mode(R *runtime, int enable_profiling) { + runtime->perf_level = enable_profiling; + if constexpr (HasEnableProfilingMember<R>::value) { + runtime->enable_profiling = (enable_profiling > 0); + } +} diff --git a/src/a2a3/platform/onboard/host/device_runner.cpp b/src/a2a3/platform/onboard/host/device_runner.cpp index 41d2235c8..24cc86b04 100644 --- a/src/a2a3/platform/onboard/host/device_runner.cpp +++ b/src/a2a3/platform/onboard/host/device_runner.cpp @@ -461,7 +461,7 @@ int DeviceRunner::run( }); // Initialize performance profiling if enabled - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { rc = init_performance_profiling(runtime, num_aicore, device_id); if (rc != 0) { LOG_ERROR("init_performance_profiling failed: %d", rc); @@ -530,13 +530,13 @@ int DeviceRunner::run( { // Poll and collect performance data in a separate collector thread std::thread collector_thread; - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { collector_thread = create_thread([this, &runtime]() { poll_and_collect_performance_data(runtime.get_task_count()); }); } auto thread_guard = RAIIScopeGuard([&]() { - if (runtime.enable_profiling && collector_thread.joinable()) { + if (runtime.perf_level > 0 && collector_thread.joinable()) { collector_thread.join(); } }); @@ -557,13 +557,13 @@ int DeviceRunner::run( } // Signal collector that device execution is complete - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.signal_execution_complete(); } } // Stop memory management, drain remaining buffers, collect phase data, export - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.stop_memory_manager(); perf_collector_.drain_remaining_buffers(); 
perf_collector_.scan_remaining_perf_buffers(); @@ -822,6 +822,7 @@ int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, i return rtFree(dev_ptr); }; + perf_collector_.set_perf_level(runtime.perf_level); return perf_collector_.initialize(runtime, num_aicore, device_id, alloc_cb, register_cb, free_cb); } diff --git a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp index ceab62b52..b945c5133 100644 --- a/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/onboard/host/pto_runtime_c_api.cpp @@ -26,6 +26,7 @@ #include "common/unified_log.h" #include "device_runner.h" // NOLINT(build/include_subdir) #include "runtime.h" // NOLINT(build/include_subdir) +#include "host/runtime_profiling_mode.h" extern "C" { @@ -162,9 +163,7 @@ int run_runtime( } // Phase 2: profiling - if (enable_profiling) { - r->enable_profiling = true; - } + set_runtime_profiling_mode(r, enable_profiling); // Phase 3: launch std::vector aicpu_vec(aicpu_binary, aicpu_binary + aicpu_size); diff --git a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index e60ceed58..3e38a1f4e 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -299,7 +299,7 @@ int DeviceRunner::run( last_runtime_ = &runtime; // Initialize performance profiling if enabled - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { rc = init_performance_profiling(runtime, num_aicore, device_id); if (rc != 0) { LOG_ERROR("init_performance_profiling failed: %d", rc); @@ -388,7 +388,7 @@ int DeviceRunner::run( // Poll and collect performance data during execution (if enabled) std::thread collector_thread; - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { collector_thread = create_thread([this, &runtime]() { poll_and_collect_performance_data(runtime.get_task_count()); }); @@ -404,19 +404,19 @@ int 
DeviceRunner::run( } // Signal collector that device execution is complete - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.signal_execution_complete(); } // Wait for collector thread if it was launched - if (runtime.enable_profiling && collector_thread.joinable()) { + if (runtime.perf_level > 0 && collector_thread.joinable()) { collector_thread.join(); } LOG_INFO("All threads completed"); // Stop memory management, drain remaining buffers, collect phase data, export - if (runtime.enable_profiling) { + if (runtime.perf_level > 0) { perf_collector_.stop_memory_manager(); perf_collector_.drain_remaining_buffers(); perf_collector_.scan_remaining_perf_buffers(); @@ -623,6 +623,7 @@ void DeviceRunner::remove_kernel_binary(int func_id) { // ============================================================================= int DeviceRunner::init_performance_profiling(Runtime &runtime, int num_aicore, int device_id) { + perf_collector_.set_perf_level(runtime.perf_level); // Define allocation callback (a2a3sim: use malloc) auto alloc_cb = [](size_t size) -> void * { return malloc(size); diff --git a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp index 37028f27d..6d85847b8 100644 --- a/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp +++ b/src/a2a3/platform/sim/host/pto_runtime_c_api.cpp @@ -28,6 +28,7 @@ #include "cpu_sim_context.h" // NOLINT(build/include_subdir) #include "device_runner.h" // NOLINT(build/include_subdir) #include "runtime.h" // NOLINT(build/include_subdir) +#include "host/runtime_profiling_mode.h" extern "C" { @@ -154,9 +155,7 @@ int run_runtime( } // Phase 2: profiling - if (enable_profiling) { - r->enable_profiling = true; - } + set_runtime_profiling_mode(r, enable_profiling); // Phase 3: launch std::vector aicpu_vec; diff --git a/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp b/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp index 
57098e111..2a5c26619 100644 --- a/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp +++ b/src/a2a3/platform/src/aicpu/performance_collector_aicpu.cpp @@ -219,7 +219,7 @@ void perf_aicpu_switch_buffer(Runtime *runtime, int core_id, int thread_idx) { } void perf_aicpu_flush_buffers(Runtime *runtime, int thread_idx, const int *cur_thread_cores, int core_num) { - if (!runtime->enable_profiling) { + if (runtime->perf_level <= 0) { return; } diff --git a/src/a2a3/platform/src/host/performance_collector.cpp b/src/a2a3/platform/src/host/performance_collector.cpp index 79a3bd78b..7ee695437 100644 --- a/src/a2a3/platform/src/host/performance_collector.cpp +++ b/src/a2a3/platform/src/host/performance_collector.cpp @@ -77,7 +77,7 @@ void ProfMemoryManager::stop() { // Drain remaining done_queue and free buffers { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo info = done_queue_.front(); done_queue_.pop(); @@ -99,7 +99,7 @@ void ProfMemoryManager::stop() { } bool ProfMemoryManager::try_pop_ready(ReadyBufferInfo &info) { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); if (ready_queue_.empty()) { return false; } @@ -121,7 +121,7 @@ bool ProfMemoryManager::wait_pop_ready(ReadyBufferInfo &info, std::chrono::milli } void ProfMemoryManager::notify_copy_done(const CopyDoneInfo &info) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); done_queue_.push(info); } @@ -210,7 +210,7 @@ void ProfMemoryManager::process_ready_entry( host_ptr = resolve_host_ptr(new_dev_ptr); } if (new_dev_ptr == nullptr) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo dinfo = done_queue_.front(); done_queue_.pop(); @@ -258,7 +258,7 @@ void ProfMemoryManager::process_ready_entry( info.buffer_seq = seq; { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); ready_queue_.push(info); } 
ready_cv_.notify_one(); @@ -289,7 +289,7 @@ void ProfMemoryManager::process_ready_entry( host_ptr = resolve_host_ptr(new_dev_ptr); } if (new_dev_ptr == nullptr) { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo dinfo = done_queue_.front(); done_queue_.pop(); @@ -335,7 +335,7 @@ void ProfMemoryManager::process_ready_entry( info.buffer_seq = seq; { - std::lock_guard lock(ready_mutex_); + std::scoped_lock lock(ready_mutex_); ready_queue_.push(info); } ready_cv_.notify_one(); @@ -348,7 +348,7 @@ void ProfMemoryManager::mgmt_loop() { while (running_.load()) { // 1. Recycle done queue: move completed buffers to recycled pools for reuse { - std::lock_guard lock(done_mutex_); + std::scoped_lock lock(done_mutex_); while (!done_queue_.empty()) { CopyDoneInfo info = done_queue_.front(); done_queue_.pop(); @@ -561,8 +561,9 @@ int PerformanceCollector::initialize( free_cb_ = free_cb; // Step 1: Calculate shared memory size (slot arrays only, no actual buffers) - int num_phase_threads = PLATFORM_MAX_AICPU_THREADS; - size_t total_size = calc_perf_data_size_with_phases(num_aicore, num_phase_threads); + int num_phase_threads = (perf_level_ >= 3) ? PLATFORM_MAX_AICPU_THREADS : 0; + size_t total_size = (num_phase_threads > 0) ? 
calc_perf_data_size_with_phases(num_aicore, num_phase_threads) : + calc_perf_data_size(num_aicore); LOG_DEBUG("Shared memory allocation plan:"); LOG_DEBUG(" Number of cores: %d", num_aicore); @@ -651,41 +652,43 @@ int PerformanceCollector::initialize( num_aicore * (PLATFORM_PROF_BUFFERS_PER_CORE - 1) ); - // Step 6: Initialize PhaseBufferStates — 1 buffer per thread in free_queue, rest to recycled pool - for (int t = 0; t < num_phase_threads; t++) { - PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); - memset(state, 0, sizeof(PhaseBufferState)); - - state->free_queue.head = 0; - state->free_queue.tail = 0; - state->current_buf_ptr = 0; - state->current_buf_seq = 0; - - for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { - void *host_buf_ptr = nullptr; - void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr); - if (dev_buf_ptr == nullptr) { - LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s); - return -1; - } - PhaseBuffer *buf = reinterpret_cast(host_buf_ptr); - memset(buf, 0, sizeof(PhaseBuffer)); - buf->count = 0; + // Step 6: Initialize PhaseBufferStates (only when phase recording enabled) + if (num_phase_threads > 0) { + for (int t = 0; t < num_phase_threads; t++) { + PhaseBufferState *state = get_phase_buffer_state(perf_host_ptr, num_aicore, t); + memset(state, 0, sizeof(PhaseBufferState)); + + state->free_queue.head = 0; + state->free_queue.tail = 0; + state->current_buf_ptr = 0; + state->current_buf_seq = 0; + + for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_THREAD; s++) { + void *host_buf_ptr = nullptr; + void *dev_buf_ptr = alloc_single_buffer(sizeof(PhaseBuffer), &host_buf_ptr); + if (dev_buf_ptr == nullptr) { + LOG_ERROR("Failed to allocate PhaseBuffer for thread %d, buffer %d", t, s); + return -1; + } + PhaseBuffer *buf = reinterpret_cast(host_buf_ptr); + memset(buf, 0, sizeof(PhaseBuffer)); + buf->count = 0; - if (s == 0) { - state->free_queue.buffer_ptrs[0] = 
reinterpret_cast(dev_buf_ptr); - } else { - memory_manager_.recycled_phase_buffers_.push_back(dev_buf_ptr); + if (s == 0) { + state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); + } else { + memory_manager_.recycled_phase_buffers_.push_back(dev_buf_ptr); + } } + wmb(); + state->free_queue.tail = 1; + wmb(); } - wmb(); - state->free_queue.tail = 1; - wmb(); + LOG_DEBUG( + "Initialized %d PhaseBufferStates: 1 buffer/thread, %d in recycled pool", num_phase_threads, + num_phase_threads * (PLATFORM_PROF_BUFFERS_PER_THREAD - 1) + ); } - LOG_DEBUG( - "Initialized %d PhaseBufferStates: 1 buffer/thread, %d in recycled pool", num_phase_threads, - num_phase_threads * (PLATFORM_PROF_BUFFERS_PER_THREAD - 1) - ); wmb(); @@ -706,8 +709,8 @@ void PerformanceCollector::start_memory_manager(const ThreadFactory &thread_fact } memory_manager_.start( - perf_shared_mem_host_, num_aicore_, PLATFORM_MAX_AICPU_THREADS, alloc_cb_, register_cb_, free_cb_, device_id_, - thread_factory + perf_shared_mem_host_, num_aicore_, (perf_level_ >= 3) ? PLATFORM_MAX_AICPU_THREADS : 0, alloc_cb_, + register_cb_, free_cb_, device_id_, thread_factory ); } @@ -1245,7 +1248,17 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { } // Step 7: Write JSON data - int version = has_phase_data_ ? 2 : 1; + int version; + if (perf_level_ <= 1) { + version = 0; + } else if (has_phase_data_) { + version = 2; + } else { + if (perf_level_ >= 3) { + LOG_WARN("perf_level=%d but no phase data collected; writing version=1", perf_level_); + } + version = 1; + } outfile << "{\n"; outfile << " \"version\": " << version << ",\n"; outfile << " \"tasks\": [\n"; @@ -1258,8 +1271,6 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { double start_us = cycles_to_us(record.start_time - base_time_cycles); double end_us = cycles_to_us(record.end_time - base_time_cycles); double duration_us = end_us - start_us; - double dispatch_us = (record.dispatch_time > 0) ? 
cycles_to_us(record.dispatch_time - base_time_cycles) : 0.0; - double finish_us = (record.finish_time > 0) ? cycles_to_us(record.finish_time - base_time_cycles) : 0.0; const char *core_type_str = (record.core_type == CoreType::AIC) ? "aic" : "aiv"; @@ -1271,20 +1282,27 @@ int PerformanceCollector::export_swimlane_json(const std::string &output_path) { outfile << " \"ring_id\": " << static_cast(record.task_id >> 32) << ",\n"; outfile << " \"start_time_us\": " << std::fixed << std::setprecision(3) << start_us << ",\n"; outfile << " \"end_time_us\": " << std::fixed << std::setprecision(3) << end_us << ",\n"; - outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << ",\n"; - outfile << " \"dispatch_time_us\": " << std::fixed << std::setprecision(3) << dispatch_us << ",\n"; - outfile << " \"finish_time_us\": " << std::fixed << std::setprecision(3) << finish_us << ",\n"; - outfile << " \"fanout\": ["; - int safe_fanout_count = - (record.fanout_count >= 0 && record.fanout_count <= RUNTIME_MAX_FANOUT) ? record.fanout_count : 0; - for (int j = 0; j < safe_fanout_count; ++j) { - outfile << record.fanout[j]; - if (j < safe_fanout_count - 1) { - outfile << ", "; + if (perf_level_ >= 2) { + double dispatch_us = + (record.dispatch_time > 0) ? cycles_to_us(record.dispatch_time - base_time_cycles) : 0.0; + double finish_us = (record.finish_time > 0) ? cycles_to_us(record.finish_time - base_time_cycles) : 0.0; + outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << ",\n"; + outfile << " \"dispatch_time_us\": " << std::fixed << std::setprecision(3) << dispatch_us << ",\n"; + outfile << " \"finish_time_us\": " << std::fixed << std::setprecision(3) << finish_us << ",\n"; + outfile << " \"fanout\": ["; + int safe_fanout_count = + (record.fanout_count >= 0 && record.fanout_count <= RUNTIME_MAX_FANOUT) ? 
record.fanout_count : 0; + for (int j = 0; j < safe_fanout_count; ++j) { + outfile << record.fanout[j]; + if (j < safe_fanout_count - 1) { + outfile << ", "; + } } + outfile << "],\n"; + outfile << " \"fanout_count\": " << record.fanout_count << "\n"; + } else { + outfile << " \"duration_us\": " << std::fixed << std::setprecision(3) << duration_us << "\n"; } - outfile << "],\n"; - outfile << " \"fanout_count\": " << record.fanout_count << "\n"; outfile << " }"; if (i < tagged_records.size() - 1) { outfile << ","; diff --git a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h index eb01973dd..6ad207d76 100644 --- a/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/aicpu_build_graph/runtime/runtime.h @@ -168,6 +168,7 @@ class Runtime { // Profiling support bool enable_profiling; // Enable profiling flag + int perf_level = 0; // Derived from enable_profiling: 0=off, 2=task+fanout // Orchestrator-to-scheduler transition control // When true, orchestrator threads convert to scheduler threads after orchestration completes. 
diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp index 9899b1a48..852c6c297 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.cpp @@ -46,6 +46,7 @@ Runtime::Runtime() { worker_count = 0; sche_cpu_num = 1; enable_profiling = false; + perf_level = 0; perf_data_base = 0; tensor_pair_count = 0; diff --git a/src/a2a3/runtime/host_build_graph/runtime/runtime.h b/src/a2a3/runtime/host_build_graph/runtime/runtime.h index 0c92ed234..9d7432a11 100644 --- a/src/a2a3/runtime/host_build_graph/runtime/runtime.h +++ b/src/a2a3/runtime/host_build_graph/runtime/runtime.h @@ -202,7 +202,8 @@ class Runtime { int sche_cpu_num; // Number of AICPU threads for scheduling // Profiling support - bool enable_profiling; // Enable profiling flag + bool enable_profiling; // Legacy flag for host_build_graph runtime paths + int perf_level; // 0=off, >0=profiling on (extended mode encoding) uint64_t perf_data_base; // Performance data shared memory base address (device-side) // Task storage diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp index cb2977227..10ab0bf82 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicore/aicore_executor.cpp @@ -88,7 +88,7 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in // Cache per-core dispatch payload pointer (set by AICPU before aicpu_ready) __gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task); - bool profiling_enabled = runtime->enable_profiling; + bool profiling_enabled = (runtime->perf_level > 0); // Phase 4: Main execution loop - poll register for tasks until exit signal // Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, 
AICORE_EXIT_SIGNAL=exit diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 7b8b60538..9d808bb1d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -456,7 +456,7 @@ struct AicpuExecutor { PTO2LocalReadyBuffer *local_bufs, CoreType ct #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_complete_count, uint64_t dispatch_ts + bool task_recording_enabled, int perf_level, uint32_t &phase_complete_count, uint64_t dispatch_ts #endif #if PTO2_SCHED_PROFILING , @@ -506,26 +506,31 @@ struct AicpuExecutor { } #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { #if PTO2_SCHED_PROFILING uint64_t t_perf_start = get_sys_cnt_aicpu(); #endif Handshake *h = &hank[core_id]; - uint64_t finish_ts = get_sys_cnt_aicpu(); PerfBuffer *perf_buf = reinterpret_cast(h->perf_records_addr); uint64_t fanout_arr[RUNTIME_MAX_FANOUT]; int32_t fanout_n = 0; - PTO2DepListEntry *cur = slot_state.fanout_head; - while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { - fanout_arr[fanout_n++] = cur->slot_state->task->task_id.raw; - cur = cur->next; + uint64_t finish_ts = 0; + + if (perf_level >= 2) { + finish_ts = get_sys_cnt_aicpu(); + PTO2DepListEntry *cur = slot_state.fanout_head; + while (cur != nullptr && fanout_n < RUNTIME_MAX_FANOUT) { + fanout_arr[fanout_n++] = cur->slot_state->task->task_id.raw; + cur = cur->next; + } } int32_t perf_slot_idx = static_cast(subslot); if (perf_aicpu_complete_record( perf_buf, static_cast(expected_reg_task_id), slot_state.task->task_id.raw, - slot_state.task->kernel_id[perf_slot_idx], ct, dispatch_ts, finish_ts, fanout_arr, fanout_n + slot_state.task->kernel_id[perf_slot_idx], ct, dispatch_ts, finish_ts, + (perf_level >= 2) ? 
fanout_arr : nullptr, fanout_n ) != 0) { DEV_ERROR( "Core %d: perf_aicpu_complete_record failed for task 0x%" PRIx64, core_id, @@ -564,7 +569,7 @@ struct AicpuExecutor { PTO2LocalReadyBuffer *local_bufs #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_complete_count + bool task_recording_enabled, int perf_level, uint32_t &phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -586,7 +591,7 @@ struct AicpuExecutor { int32_t reg_state = EXTRACT_TASK_STATE(reg_val); #if PTO2_SCHED_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { complete_probe_count++; } #endif @@ -596,7 +601,7 @@ struct AicpuExecutor { if (!t.matched) continue; #if PTO2_SCHED_PROFILING - if (profiling_enabled && (t.running_done || t.pending_done)) { + if (task_recording_enabled && (t.running_done || t.pending_done)) { complete_hit_count++; } #endif @@ -610,7 +615,7 @@ struct AicpuExecutor { completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs, CT #if PTO2_PROFILING , - profiling_enabled, phase_complete_count, core.pending_dispatch_timestamp + task_recording_enabled, perf_level, phase_complete_count, core.pending_dispatch_timestamp #endif #if PTO2_SCHED_PROFILING , @@ -626,7 +631,7 @@ struct AicpuExecutor { completed_this_turn, deferred_release_slot_states, deferred_release_count, local_bufs, CT #if PTO2_PROFILING , - profiling_enabled, phase_complete_count, core.running_dispatch_timestamp + task_recording_enabled, perf_level, phase_complete_count, core.running_dispatch_timestamp #endif #if PTO2_SCHED_PROFILING , @@ -773,7 +778,7 @@ struct AicpuExecutor { PTO2SubtaskSlot subslot, bool to_pending #if PTO2_PROFILING , - bool profiling_enabled + bool task_recording_enabled #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -813,7 +818,7 @@ struct AicpuExecutor { core_exec_state.pending_slot_state = &slot_state; core_exec_state.pending_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (profiling_enabled) { + 
if (task_recording_enabled && runtime->perf_level >= 2) { core_exec_state.pending_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -822,7 +827,7 @@ struct AicpuExecutor { core_exec_state.running_slot_state = &slot_state; core_exec_state.running_reg_task_id = static_cast(reg_task_id); #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled && runtime->perf_level >= 2) { core_exec_state.running_dispatch_timestamp = get_sys_cnt_aicpu(); } #endif @@ -830,7 +835,7 @@ struct AicpuExecutor { tracker.change_core_state(core_offset); } #if PTO2_PROFILING - if (profiling_enabled) { + if (task_recording_enabled) { if (core_exec_state.dispatch_count >= PLATFORM_PROF_BUFFER_SIZE) { perf_aicpu_switch_buffer(runtime, core_id, thread_idx); core_exec_state.dispatch_count = 0; @@ -852,7 +857,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t thread_idx, int32_t cluster_offset, PTO2TaskSlotState &slot_state #if PTO2_PROFILING , - bool profiling_enabled + bool task_recording_enabled #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -863,7 +868,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } @@ -873,7 +878,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } @@ -883,7 +888,7 @@ struct AicpuExecutor { false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } @@ -923,7 +928,7 @@ struct AicpuExecutor { PTO2ResourceShape shape #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { CoreTracker &tracker = core_trackers_[thread_idx]; @@ -932,7 +937,7 @@ struct AicpuExecutor { runtime, thread_idx, cluster_offset, slot_state #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } else if (shape == PTO2ResourceShape::AIC) { @@ -941,7 +946,7 @@ struct AicpuExecutor 
{ false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } else { // AIV @@ -952,12 +957,14 @@ struct AicpuExecutor { runtime, thread_idx, core_offset, slot_state, PTO2SubtaskSlot::AIV0, false #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); } #if PTO2_PROFILING - phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); + if (phase_recording_enabled) { + phase_dispatch_count += __builtin_popcount(pto2_core_mask(slot_state.active_mask)); + } #endif } @@ -981,7 +988,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t block_num #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { PTO2TaskSlotState *slot_state = drain_state_.pending_task; @@ -998,7 +1005,7 @@ struct AicpuExecutor { runtime, t, valid.pop_first(), *slot_state, shape #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); slot_state->next_block_idx++; @@ -1032,7 +1039,7 @@ struct AicpuExecutor { Runtime *runtime, int32_t thread_idx #if PTO2_PROFILING , - bool profiling_enabled, uint32_t &phase_dispatch_count + bool task_recording_enabled, bool phase_recording_enabled, uint32_t &phase_dispatch_count #endif ) { // Spin until drain is fully initialized (sentinel -1 → block_num > 0). 
@@ -1081,7 +1088,7 @@ struct AicpuExecutor { runtime, block_num #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); } @@ -1474,9 +1481,10 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #if PTO2_PROFILING // Assign perf buffers to cores early so profiling captures all tasks // (total_tasks written to header later when orchestrator completes) - if (runtime->enable_profiling) { + if (runtime->perf_level > 0) { perf_aicpu_init_profiling(runtime); - // Initialize phase profiling for scheduler threads + orchestrator threads + } + if (runtime->perf_level >= 3) { perf_aicpu_init_phase_profiling(runtime, sched_thread_num_); perf_aicpu_set_orch_thread_idx(sched_thread_num_); } @@ -1495,7 +1503,9 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa int32_t idle_iterations = 0; int32_t last_progress_count = 0; #if PTO2_PROFILING - bool profiling_enabled = runtime->enable_profiling; + int perf_level = runtime->perf_level; + bool task_recording_enabled = (perf_level > 0); + bool phase_recording_enabled = (perf_level >= 3); #endif // Scheduler profiling counters @@ -1618,7 +1628,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa deferred_release_slot_states, deferred_release_count, local_bufs #if PTO2_PROFILING , - profiling_enabled, phase_complete_count + task_recording_enabled, perf_level, phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -1636,7 +1646,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa deferred_release_slot_states, deferred_release_count, local_bufs #if PTO2_PROFILING , - profiling_enabled, phase_complete_count + task_recording_enabled, perf_level, phase_complete_count #endif #if PTO2_SCHED_PROFILING , @@ -1668,7 +1678,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa 
CYCLE_COUNT_LAP(sched_idle_cycle); } else { CYCLE_COUNT_LAP(sched_complete_cycle); - if (profiling_enabled && phase_complete_count > 0) { + if (phase_recording_enabled && phase_complete_count > 0) { perf_aicpu_record_phase( thread_idx, AicpuPhaseId::SCHED_COMPLETE, _t0_phase, _t1, sched_loop_count, phase_complete_count ); @@ -1689,7 +1699,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, thread_idx #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); continue; @@ -1773,7 +1783,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa runtime, thread_idx, current_valid_cluster_offset, *slot_state, shape #if PTO2_PROFILING , - profiling_enabled, phase_dispatch_count + task_recording_enabled, phase_recording_enabled, phase_dispatch_count #endif ); slot_state->next_block_idx++; @@ -1841,7 +1851,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa PTO2SubtaskSlot::AIC, true #if PTO2_PROFILING , - profiling_enabled + task_recording_enabled #endif ); slot_state->next_block_idx++; @@ -1879,7 +1889,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa CYCLE_COUNT_LAP(sched_idle_cycle); } else { CYCLE_COUNT_LAP(sched_dispatch_cycle); - if (profiling_enabled && phase_dispatch_count > 0) { + if (phase_recording_enabled && phase_dispatch_count > 0) { perf_aicpu_record_phase( thread_idx, AicpuPhaseId::SCHED_DISPATCH, _t0_phase, _t1, sched_loop_count, phase_dispatch_count ); @@ -2039,7 +2049,7 @@ int32_t AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa } #if PTO2_PROFILING CYCLE_COUNT_LAP(sched_idle_cycle); - if (profiling_enabled) { + if (phase_recording_enabled) { perf_aicpu_record_phase(thread_idx, AicpuPhaseId::SCHED_IDLE_WAIT, _t0_phase, _t1, sched_loop_count, 0); _t0_phase = _t1; } @@ -2207,9 +2217,10 @@ int32_t 
AicpuExecutor::resolve_and_dispatch_pto2(Runtime *runtime, int32_t threa #endif #if PTO2_PROFILING - // Flush performance buffers for cores managed by this thread - if (profiling_enabled) { + if (task_recording_enabled) { perf_aicpu_flush_buffers(runtime, thread_idx, core_assignments_[thread_idx], core_num); + } + if (phase_recording_enabled) { perf_aicpu_flush_phase_buffers(thread_idx); } #endif @@ -2388,7 +2399,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #if PTO2_PROFILING - rt->orchestrator.enable_profiling = runtime->enable_profiling; + rt->orchestrator.perf_level = runtime->perf_level; #endif // Total core counts = aic_count_ / aiv_count_ (set once at runtime init). @@ -2412,7 +2423,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #if PTO2_PROFILING - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { perf_aicpu_set_orch_thread_idx(thread_idx); } #endif @@ -2496,7 +2507,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING // Write orchestrator summary to shared memory for host-side export (only if profiling enabled) - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { AicpuOrchSummary orch_summary = {}; orch_summary.start_time = orch_cycle_start; orch_summary.end_time = orch_cycle_end; @@ -2516,7 +2527,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { #if PTO2_PROFILING // Write core-to-thread mapping (one-time, after orchestration) - if (runtime->enable_profiling) { + if (runtime->perf_level >= 3) { perf_aicpu_write_core_assignments( core_assignments_, core_count_per_thread_, sched_thread_num_, cores_total_num_ ); @@ -2540,7 +2551,7 @@ int32_t AicpuExecutor::run(Runtime *runtime) { pto2_submitted_tasks = pto2_task_count; #endif total_tasks_ = pto2_task_count; - if (runtime->enable_profiling && pto2_task_count > 0) { + if (runtime->perf_level > 0 && pto2_task_count > 0) { perf_aicpu_update_total_tasks(runtime, static_cast(pto2_task_count)); } int32_t inline_completed = 
static_cast(rt->orchestrator.inline_completed_tasks); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index e201f3d30..eed777d19 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -94,8 +94,8 @@ __attribute__((weak, visibility("hidden"))) void perf_aicpu_record_orch_phase(AicpuPhaseId, uint64_t, uint64_t, uint32_t, uint64_t) {} // submit_idx needed for swimlane task_id tagging (no cycle accumulation at this level) static uint32_t g_orch_submit_idx = 0; -#define CYCLE_COUNT_START() \ - bool _prof_active = orch->enable_profiling; \ +#define CYCLE_COUNT_START() \ + bool _prof_active = (orch->perf_level >= 3); \ uint64_t _t0 = _prof_active ? get_sys_cnt_aicpu() : 0, _t1 = 0 #define CYCLE_COUNT_LAP(acc) \ do { \ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h index 9db96eaa1..8bf27e791 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.h @@ -75,8 +75,7 @@ struct PTO2OrchestratorState { int32_t total_cluster_count{0}; // AIC cores = MIX clusters int32_t total_aiv_count{0}; // AIV cores (= 2 × clusters on standard hardware) #if PTO2_PROFILING - // Runtime profiling switch copied from Runtime::enable_profiling. 
- bool enable_profiling; + int perf_level; #endif // === GM HEAP (for output buffers) === diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 18a069ab2..7b4308134 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -170,7 +170,7 @@ class Runtime { uint64_t func_id_to_addr_[RUNTIME_MAX_FUNC_ID]; // Profiling support - bool enable_profiling; // Enable profiling flag + int perf_level = 0; // 0=off, 1=AICore-only, 2=task+fanout, 3=full // Orchestrator-to-scheduler transition control // When true, orchestrator threads convert to scheduler threads after orchestration completes. diff --git a/src/common/distributed/dist_chip_process.cpp b/src/common/distributed/dist_chip_process.cpp index dd8f7621a..dece07f5f 100644 --- a/src/common/distributed/dist_chip_process.cpp +++ b/src/common/distributed/dist_chip_process.cpp @@ -57,7 +57,7 @@ void DistChipProcess::run(const WorkerPayload &payload) { // Write config fields int32_t block_dim = payload.block_dim; int32_t aicpu_tn = payload.aicpu_thread_num; - int32_t profiling = payload.enable_profiling ? 
1 : 0; + int32_t profiling = payload.enable_profiling; std::memcpy(base() + OFF_BLOCK_DIM, &block_dim, sizeof(int32_t)); std::memcpy(base() + OFF_AICPU_THREAD_NUM, &aicpu_tn, sizeof(int32_t)); std::memcpy(base() + OFF_ENABLE_PROFILING, &profiling, sizeof(int32_t)); diff --git a/src/common/distributed/dist_types.h b/src/common/distributed/dist_types.h index b3a9695ee..a3bc802b7 100644 --- a/src/common/distributed/dist_types.h +++ b/src/common/distributed/dist_types.h @@ -81,7 +81,7 @@ struct WorkerPayload { const void *args = nullptr; // ChipStorageTaskArgs* int32_t block_dim = 1; int32_t aicpu_thread_num = 3; - bool enable_profiling = false; + int enable_profiling = 0; // --- SubWorker fields --- int32_t callable_id = -1; diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 2ef0a9b00..1ade6d818 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -191,7 +191,7 @@ void ChipWorker::run(const WorkerPayload &payload) { ChipCallConfig config; config.block_dim = payload.block_dim; config.aicpu_thread_num = payload.aicpu_thread_num; - config.enable_profiling = payload.enable_profiling; + config.perf_level = payload.enable_profiling; run(payload.callable, payload.args, config); } @@ -204,7 +204,7 @@ void ChipWorker::run(const void *callable, const void *args, const ChipCallConfi int rc = run_runtime_fn_( device_ctx_, rt, callable, args, config.block_dim, config.aicpu_thread_num, device_id_, aicpu_binary_.data(), - aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.enable_profiling ? 
1 : 0 + aicpu_binary_.size(), aicore_binary_.data(), aicore_binary_.size(), config.perf_level ); if (rc != 0) { throw std::runtime_error("run_runtime failed with code " + std::to_string(rc)); diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index b6fd7c4b5..cf614ae22 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -21,7 +21,7 @@ struct ChipCallConfig { int block_dim = 24; int aicpu_thread_num = 3; - bool enable_profiling = false; + int perf_level = 0; // 0=off, 1=AICore-only, 2=task+fanout, 3=full }; class ChipWorker : public IWorker { diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index 59b85f985..8a3947e3b 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -31,7 +31,7 @@ def test_defaults(self): config = ChipCallConfig() assert config.block_dim == 24 assert config.aicpu_thread_num == 3 - assert config.enable_profiling is False + assert config.enable_profiling == 0 def test_setters(self): config = ChipCallConfig() @@ -40,13 +40,13 @@ def test_setters(self): config.enable_profiling = True assert config.block_dim == 32 assert config.aicpu_thread_num == 4 - assert config.enable_profiling is True + assert config.enable_profiling > 0 def test_repr(self): config = ChipCallConfig() r = repr(config) assert "block_dim=24" in r - assert "enable_profiling=False" in r + assert "enable_profiling=0" in r # ============================================================================ diff --git a/tools/swimlane_converter.py b/tools/swimlane_converter.py index f47ae9b47..f7cb2670b 100644 --- a/tools/swimlane_converter.py +++ b/tools/swimlane_converter.py @@ -120,8 +120,8 @@ def read_perf_data(filepath): raise ValueError(f"Missing required field: {field}") # Validate version - if data["version"] not in [1, 2]: - raise ValueError(f"Unsupported version: {data['version']} (expected 1 or 2)") + if data["version"] not in [0, 1, 2]: + raise 
ValueError(f"Unsupported version: {data['version']} (expected 0/1/2)") return data @@ -495,7 +495,8 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 dur = task["duration_us"] # Build fanout hint string (packed ids → rXtY / tY for readability) - fanout_str = "[" + ", ".join(format_task_display(x) for x in task["fanout"]) + "]" + fanout_list = task.get("fanout", []) + fanout_str = "[" + ", ".join(format_task_display(x) for x in fanout_list) + "]" # Get function name if available func_id = task["func_id"] @@ -579,7 +580,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 src_tid = core_to_tid[task["core_id"]] src_ts_end = task["end_time_us"] - for succ_task_id in task["fanout"]: + for succ_task_id in task.get("fanout", []): if succ_task_id not in task_map: if verbose: print( @@ -810,7 +811,7 @@ def generate_chrome_trace_json( # noqa: PLR0912, PLR0915 src_tid = core_to_tid[task["core_id"]] - for succ_task_id in task["fanout"]: + for succ_task_id in task.get("fanout", []): if succ_task_id not in task_map: continue