diff --git a/ci.py b/ci.py index 713294e93..095c52153 100644 --- a/ci.py +++ b/ci.py @@ -354,6 +354,7 @@ def _compile_kernel(kernel): func_name=orchestration["function_name"], binary=orch_binary, children=kernel_binaries, + config_name=orchestration.get("config_name", ""), ) all_cases = getattr(golden, "ALL_CASES", {"Default": {}}) diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 2e1962df0..31d1cbb11 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -448,7 +448,7 @@ NB_MODULE(_task_interface, m) { .def_static( "build", [](std::vector signature, std::string func_name, nb::bytes binary, - std::vector> children) -> PyChipCallable { + std::vector> children, std::string config_name) -> PyChipCallable { auto bin_ptr = reinterpret_cast(binary.c_str()); auto bin_size = static_cast(binary.size()); auto child_count = static_cast(children.size()); @@ -462,11 +462,12 @@ NB_MODULE(_task_interface, m) { auto buf = make_callable( signature.data(), static_cast(signature.size()), func_name.c_str(), bin_ptr, bin_size, - func_ids.data(), child_bufs.data(), child_count + func_ids.data(), child_bufs.data(), child_count, config_name.c_str() ); return PyChipCallable{std::move(buf)}; }, nb::arg("signature"), nb::arg("func_name"), nb::arg("binary"), nb::arg("children"), + nb::arg("config_name") = "", "Build a ChipCallable from signature, func_name, binary, and list of (func_id, CoreCallable) children." ) @@ -503,6 +504,15 @@ NB_MODULE(_task_interface, m) { "The orchestration function name." ) + .def_prop_ro( + "config_name", + [](const PyChipCallable &self) -> std::string { + const auto &c = self.get(); + return std::string(c.config_name(), c.config_name_len()); + }, + "The optional orchestration config function name." 
+ ) + .def_prop_ro( "child_count", [](const PyChipCallable &self) -> int32_t { @@ -568,9 +578,9 @@ NB_MODULE(_task_interface, m) { .def("__repr__", [](const PyChipCallable &self) -> std::string { const auto &c = self.get(); std::ostringstream os; - os << "ChipCallable(func_name=\"" << std::string(c.func_name(), c.func_name_len()) - << "\", sig_count=" << c.sig_count() << ", binary_size=" << c.binary_size() - << ", child_count=" << c.child_count() << ")"; + os << "ChipCallable(func_name=\"" << std::string(c.func_name(), c.func_name_len()) << "\", config_name=\"" + << std::string(c.config_name(), c.config_name_len()) << "\", sig_count=" << c.sig_count() + << ", binary_size=" << c.binary_size() << ", child_count=" << c.child_count() << ")"; return os.str(); }); diff --git a/simpler_setup/code_runner.py b/simpler_setup/code_runner.py index f382aa276..600453b73 100644 --- a/simpler_setup/code_runner.py +++ b/simpler_setup/code_runner.py @@ -819,6 +819,7 @@ def _compile_one_kernel(kernel): func_name=self.orchestration["function_name"], binary=orch_so_binary, children=kernel_binaries, + config_name=self.orchestration.get("config_name", ""), ) # Step 2: Create ChipWorker diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py index 1ea89676c..b0a500d61 100644 --- a/simpler_setup/scene_test.py +++ b/simpler_setup/scene_test.py @@ -328,6 +328,7 @@ def _compile_chip_callable_from_spec(spec, platform, runtime, cache_key): func_name=orch["function_name"], binary=orch_binary, children=kernel_binaries, + config_name=orch.get("config_name", ""), ) _compile_cache[cache_key] = chip_callable return chip_callable diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index ae5af8b4d..54bd70ce0 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -88,6 +88,8 @@ constexpr 
int32_t STALL_DUMP_WAIT_MAX = 4; constexpr int32_t STALL_DUMP_CORE_MAX = 8; constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10; // log every completion for the first N tasks constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions after threshold +constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry"; +constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config"; static int32_t read_pto2_runtime_status(Runtime *runtime) { if (runtime == nullptr) { @@ -2312,33 +2314,52 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); - dlerror(); - auto config_func = - reinterpret_cast(dlsym(handle, "aicpu_orchestration_config")); + const char *entry_symbol = runtime->get_device_orch_func_name(); + if (entry_symbol == nullptr || entry_symbol[0] == '\0') { + entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL; + } + const char *config_symbol = runtime->get_device_orch_config_name(); + if (config_symbol == nullptr || config_symbol[0] == '\0') { + config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL; + } dlerror(); - DeviceOrchestrationFunc orch_func = - reinterpret_cast(dlsym(handle, "aicpu_orchestration_entry")); - const char *dlsym_error = dlerror(); - if (dlsym_error != nullptr) { - DEV_ERROR("Thread %d: dlsym failed: %s", thread_idx, dlsym_error); + DeviceOrchestrationFunc orch_func = reinterpret_cast(dlsym(handle, entry_symbol)); + const char *entry_dlsym_error = dlerror(); + if (entry_dlsym_error != nullptr) { + DEV_ERROR( + "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error + ); dlclose(handle); unlink(so_path); return -1; } if (orch_func == nullptr) { - DEV_ERROR("Thread %d: dlsym returned NULL for aicpu_orchestration_entry", thread_idx); + DEV_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol); dlclose(handle); unlink(so_path); return -1; } + dlerror(); + auto config_func = 
reinterpret_cast(dlsym(handle, config_symbol)); + const char *config_dlsym_error = dlerror(); + if (config_dlsym_error != nullptr || config_func == nullptr) { + DEV_ERROR( + "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol, + config_dlsym_error ? config_dlsym_error : "NULL function pointer" + ); + config_func = nullptr; + } + dlerror(); auto bind_runtime_func = reinterpret_cast(dlsym(handle, "pto2_framework_bind_runtime")); const char *bind_runtime_error = dlerror(); if (bind_runtime_error != nullptr) { - DEV_INFO("Thread %d: Optional TLS runtime binder not found: %s", thread_idx, bind_runtime_error); + DEV_ERROR( + "Thread %d: dlsym failed for pto2_framework_bind_runtime: %s", thread_idx, bind_runtime_error + ); bind_runtime_func = nullptr; } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 81accc910..b2d3b25de 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -134,6 +134,8 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const uint8_t *orch_so_binary = static_cast(callable->binary_data()); size_t orch_so_size = callable->binary_size(); + runtime->set_device_orch_func_name(callable->func_name()); + runtime->set_device_orch_config_name(callable->config_name()); if (orch_so_binary == nullptr || orch_so_size == 0) { LOG_ERROR("Orchestration SO binary is required for device orchestration"); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp index 3c1a25499..ff0cbda6d 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp @@ -51,6 +51,8 @@ Runtime::Runtime() { // Initialize device orchestration SO binary device_orch_so_size_ = 0; + 
device_orch_func_name_[0] = '\0'; + device_orch_config_name_[0] = '\0'; // Initialize kernel binary tracking registered_kernel_count_ = 0; @@ -119,6 +121,28 @@ const void *Runtime::get_device_orch_so_data() const { size_t Runtime::get_device_orch_so_size() const { return device_orch_so_size_; } +void Runtime::set_device_orch_func_name(const char *name) { + if (name == nullptr) { + device_orch_func_name_[0] = '\0'; + return; + } + std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1); + device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0'; +} + +const char *Runtime::get_device_orch_func_name() const { return device_orch_func_name_; } + +void Runtime::set_device_orch_config_name(const char *name) { + if (name == nullptr) { + device_orch_config_name_[0] = '\0'; + return; + } + std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1); + device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0'; +} + +const char *Runtime::get_device_orch_config_name() const { return device_orch_config_name_; } + uint64_t Runtime::get_function_bin_addr(int func_id) const { if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0; return func_id_to_addr_[func_id]; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index f45be4b29..2d0e0b4b4 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -49,6 +49,7 @@ #define RUNTIME_MAX_TENSOR_PAIRS 64 #define RUNTIME_MAX_FUNC_ID 32 #define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4MB max for orchestration SO +#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64 // Default ready queue shards: one shard per worker thread (total minus orchestrator) constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1; @@ -199,6 +200,8 @@ class Runtime { // Stored as a copy to avoid lifetime issues with Python 
ctypes arrays uint8_t device_orch_so_storage_[RUNTIME_MAX_ORCH_SO_SIZE]; size_t device_orch_so_size_; + char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; + char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; public: /** @@ -252,6 +255,10 @@ class Runtime { void set_device_orch_so(const void *data, size_t size); const void *get_device_orch_so_data() const; size_t get_device_orch_so_size() const; + void set_device_orch_func_name(const char *name); + const char *get_device_orch_func_name() const; + void set_device_orch_config_name(const char *name); + const char *get_device_orch_config_name() const; uint64_t get_function_bin_addr(int func_id) const; void set_function_bin_addr(int func_id, uint64_t addr); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 37c2c81c2..a46dfbce1 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -88,6 +88,8 @@ constexpr int32_t STALL_DUMP_WAIT_MAX = 4; constexpr int32_t STALL_DUMP_CORE_MAX = 8; constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10; // log every completion for the first N tasks constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions after threshold +constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry"; +constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config"; static int32_t read_pto2_runtime_status(Runtime *runtime) { if (runtime == nullptr) { @@ -2287,33 +2289,52 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); - dlerror(); - auto config_func = - reinterpret_cast(dlsym(handle, "aicpu_orchestration_config")); + const char *entry_symbol = runtime->get_device_orch_func_name(); + if (entry_symbol == nullptr || entry_symbol[0] == '\0') { + entry_symbol = 
DEFAULT_ORCH_ENTRY_SYMBOL; + } + const char *config_symbol = runtime->get_device_orch_config_name(); + if (config_symbol == nullptr || config_symbol[0] == '\0') { + config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL; + } dlerror(); - DeviceOrchestrationFunc orch_func = - reinterpret_cast(dlsym(handle, "aicpu_orchestration_entry")); - const char *dlsym_error = dlerror(); - if (dlsym_error != nullptr) { - DEV_ERROR("Thread %d: dlsym failed: %s", thread_idx, dlsym_error); + DeviceOrchestrationFunc orch_func = reinterpret_cast(dlsym(handle, entry_symbol)); + const char *entry_dlsym_error = dlerror(); + if (entry_dlsym_error != nullptr) { + DEV_ERROR( + "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error + ); dlclose(handle); unlink(so_path); return -1; } if (orch_func == nullptr) { - DEV_ERROR("Thread %d: dlsym returned NULL for aicpu_orchestration_entry", thread_idx); + DEV_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol); dlclose(handle); unlink(so_path); return -1; } + dlerror(); + auto config_func = reinterpret_cast(dlsym(handle, config_symbol)); + const char *config_dlsym_error = dlerror(); + if (config_dlsym_error != nullptr || config_func == nullptr) { + DEV_ERROR( + "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol, + config_dlsym_error ? 
config_dlsym_error : "NULL function pointer" + ); + config_func = nullptr; + } + dlerror(); auto bind_runtime_func = reinterpret_cast(dlsym(handle, "pto2_framework_bind_runtime")); const char *bind_runtime_error = dlerror(); if (bind_runtime_error != nullptr) { - DEV_INFO("Thread %d: Optional TLS runtime binder not found: %s", thread_idx, bind_runtime_error); + DEV_ERROR( + "Thread %d: dlsym failed for pto2_framework_bind_runtime: %s", thread_idx, bind_runtime_error + ); bind_runtime_func = nullptr; } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index 81accc910..b2d3b25de 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -134,6 +134,8 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable, const uint8_t *orch_so_binary = static_cast(callable->binary_data()); size_t orch_so_size = callable->binary_size(); + runtime->set_device_orch_func_name(callable->func_name()); + runtime->set_device_orch_config_name(callable->config_name()); if (orch_so_binary == nullptr || orch_so_size == 0) { LOG_ERROR("Orchestration SO binary is required for device orchestration"); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp index 3c1a25499..ff0cbda6d 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp @@ -51,6 +51,8 @@ Runtime::Runtime() { // Initialize device orchestration SO binary device_orch_so_size_ = 0; + device_orch_func_name_[0] = '\0'; + device_orch_config_name_[0] = '\0'; // Initialize kernel binary tracking registered_kernel_count_ = 0; @@ -119,6 +121,28 @@ const void *Runtime::get_device_orch_so_data() const { size_t Runtime::get_device_orch_so_size() const { return device_orch_so_size_; 
} +void Runtime::set_device_orch_func_name(const char *name) { + if (name == nullptr) { + device_orch_func_name_[0] = '\0'; + return; + } + std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1); + device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0'; +} + +const char *Runtime::get_device_orch_func_name() const { return device_orch_func_name_; } + +void Runtime::set_device_orch_config_name(const char *name) { + if (name == nullptr) { + device_orch_config_name_[0] = '\0'; + return; + } + std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1); + device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0'; +} + +const char *Runtime::get_device_orch_config_name() const { return device_orch_config_name_; } + uint64_t Runtime::get_function_bin_addr(int func_id) const { if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0; return func_id_to_addr_[func_id]; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index d1a73f3f2..d57b80cde 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -49,6 +49,7 @@ #define RUNTIME_MAX_TENSOR_PAIRS 64 #define RUNTIME_MAX_FUNC_ID 32 -#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 1MB max for orchestration SO +#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4MB max for orchestration SO +#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64 // Default ready queue shards: one shard per worker thread (total minus orchestrator) constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1; @@ -199,6 +200,8 @@ class Runtime { // Stored as a copy to avoid lifetime issues with Python ctypes arrays uint8_t device_orch_so_storage_[RUNTIME_MAX_ORCH_SO_SIZE]; size_t device_orch_so_size_; + char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; + char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME]; public: /** @@ -252,6 +255,10 @@ class Runtime { void 
set_device_orch_so(const void *data, size_t size); const void *get_device_orch_so_data() const; size_t get_device_orch_so_size() const; + void set_device_orch_func_name(const char *name); + const char *get_device_orch_func_name() const; + void set_device_orch_config_name(const char *name); + const char *get_device_orch_config_name() const; uint64_t get_function_bin_addr(int func_id) const; void set_function_bin_addr(int func_id, uint64_t addr); diff --git a/src/common/task_interface/callable.h b/src/common/task_interface/callable.h index 9301d4204..50f7c3f29 100644 --- a/src/common/task_interface/callable.h +++ b/src/common/task_interface/callable.h @@ -105,6 +105,8 @@ struct Callable { int32_t child_func_ids_[MaxChildren]; uint32_t child_offsets_[MaxChildren]; int32_t child_count_; + char config_name_[CALLABLE_FUNC_NAME_MAX]; + uint32_t config_name_len_; char storage_[]; ArgDirection sig(int32_t i) const { @@ -116,6 +118,8 @@ struct Callable { uint32_t binary_size() const { return binary_size_; } const char *func_name() const { return func_name_; } uint32_t func_name_len() const { return func_name_len_; } + const char *config_name() const { return config_name_; } + uint32_t config_name_len() const { return config_name_len_; } const Child &child(int32_t i) const { if (i < 0 || i >= child_count_) throw std::out_of_range("Callable: child index out of range"); @@ -137,7 +141,8 @@ struct Callable { template friend std::vector make_callable( const ArgDirection *sig, int32_t sig_count, const char *func_name, const void *binary, uint32_t binary_size, - const int32_t *child_func_ids, const std::vector *child_buffers, int32_t child_count + const int32_t *child_func_ids, const std::vector *child_buffers, int32_t child_count, + const char *config_name ); }; @@ -180,7 +185,8 @@ make_callable(const ArgDirection *sig, int32_t sig_count, const void *binary, ui template std::vector make_callable( const ArgDirection *sig, int32_t sig_count, const char *func_name, const void 
*binary, uint32_t binary_size, - const int32_t *child_func_ids, const std::vector *child_buffers, int32_t child_count + const int32_t *child_func_ids, const std::vector *child_buffers, int32_t child_count, + const char *config_name = nullptr ) { if (sig_count > MaxSig) throw std::invalid_argument("make_callable: sig_count exceeds MaxSig"); if (child_count > MaxChildren) throw std::invalid_argument("make_callable: child_count exceeds MaxChildren"); @@ -215,6 +221,17 @@ std::vector make_callable( obj->func_name_len_ = 0; } + // Store config_name (null-terminated, truncated to CALLABLE_FUNC_NAME_MAX-1) + std::memset(obj->config_name_, 0, CALLABLE_FUNC_NAME_MAX); + if (config_name != nullptr) { + size_t name_len = std::strlen(config_name); + if (name_len >= CALLABLE_FUNC_NAME_MAX) name_len = CALLABLE_FUNC_NAME_MAX - 1; + std::memcpy(obj->config_name_, config_name, name_len); + obj->config_name_len_ = static_cast(name_len); + } else { + obj->config_name_len_ = 0; + } + if (binary_size > 0) std::memcpy(obj->storage_, binary, binary_size); for (int32_t i = 0; i < child_count; ++i) { diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp index 57b22dcff..b3314019a 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp @@ -61,7 +61,7 @@ aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { }; } -__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) { +__attribute__((visibility("default"))) void build_paged_attention_graph(const ChipStorageTaskArgs &orch_args) { uint64_t prof_param_extract = 0; uint64_t prof_ext_tensor = 0; uint64_t prof_scope = 0; diff --git 
a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp index 3e7a5935a..1460a588d 100644 --- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp +++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp @@ -73,7 +73,7 @@ aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) { }; } -__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) { +__attribute__((visibility("default"))) void build_paged_attention_graph(const ChipStorageTaskArgs &orch_args) { #ifdef ENABLE_PROFILING uint64_t prof_param_extract = 0; uint64_t prof_ext_tensor = 0;