diff --git a/ci.py b/ci.py
index 713294e93..095c52153 100644
--- a/ci.py
+++ b/ci.py
@@ -354,6 +354,7 @@ def _compile_kernel(kernel):
         func_name=orchestration["function_name"],
         binary=orch_binary,
         children=kernel_binaries,
+        config_name=orchestration.get("config_name", ""),
     )
 
     all_cases = getattr(golden, "ALL_CASES", {"Default": {}})
diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp
index 2e1962df0..31d1cbb11 100644
--- a/python/bindings/task_interface.cpp
+++ b/python/bindings/task_interface.cpp
@@ -448,7 +448,7 @@ NB_MODULE(_task_interface, m) {
         .def_static(
             "build",
             [](std::vector<ArgDirection> signature, std::string func_name, nb::bytes binary,
-               std::vector<std::tuple<int32_t, PyCoreCallable>> children) -> PyChipCallable {
+               std::vector<std::tuple<int32_t, PyCoreCallable>> children, std::string config_name) -> PyChipCallable {
                 auto bin_ptr = reinterpret_cast<const void *>(binary.c_str());
                 auto bin_size = static_cast<uint32_t>(binary.size());
                 auto child_count = static_cast<int32_t>(children.size());
@@ -462,11 +462,12 @@ NB_MODULE(_task_interface, m) {
 
                 auto buf = make_callable<CoreCallable, CHIP_MAX_TENSOR_ARGS, 32>(
                     signature.data(), static_cast<int32_t>(signature.size()), func_name.c_str(), bin_ptr, bin_size,
-                    func_ids.data(), child_bufs.data(), child_count
+                    func_ids.data(), child_bufs.data(), child_count, config_name.c_str()
                 );
                 return PyChipCallable{std::move(buf)};
             },
             nb::arg("signature"), nb::arg("func_name"), nb::arg("binary"), nb::arg("children"),
+            nb::arg("config_name") = "",
             "Build a ChipCallable from signature, func_name, binary, and list of (func_id, CoreCallable) children."
         )
 
@@ -503,6 +504,15 @@ NB_MODULE(_task_interface, m) {
             "The orchestration function name."
         )
 
+        .def_prop_ro(
+            "config_name",
+            [](const PyChipCallable &self) -> std::string {
+                const auto &c = self.get();
+                return std::string(c.config_name(), c.config_name_len());
+            },
+            "The optional orchestration config function name."
+        )
+
         .def_prop_ro(
             "child_count",
             [](const PyChipCallable &self) -> int32_t {
@@ -568,9 +578,9 @@ NB_MODULE(_task_interface, m) {
         .def("__repr__", [](const PyChipCallable &self) -> std::string {
             const auto &c = self.get();
             std::ostringstream os;
-            os << "ChipCallable(func_name=\"" << std::string(c.func_name(), c.func_name_len())
-               << "\", sig_count=" << c.sig_count() << ", binary_size=" << c.binary_size()
-               << ", child_count=" << c.child_count() << ")";
+            os << "ChipCallable(func_name=\"" << std::string(c.func_name(), c.func_name_len()) << "\", config_name=\""
+               << std::string(c.config_name(), c.config_name_len()) << "\", sig_count=" << c.sig_count()
+               << ", binary_size=" << c.binary_size() << ", child_count=" << c.child_count() << ")";
             return os.str();
         });
 
diff --git a/simpler_setup/code_runner.py b/simpler_setup/code_runner.py
index f382aa276..600453b73 100644
--- a/simpler_setup/code_runner.py
+++ b/simpler_setup/code_runner.py
@@ -819,6 +819,7 @@ def _compile_one_kernel(kernel):
             func_name=self.orchestration["function_name"],
             binary=orch_so_binary,
             children=kernel_binaries,
+            config_name=self.orchestration.get("config_name", ""),
         )
 
         # Step 2: Create ChipWorker
diff --git a/simpler_setup/scene_test.py b/simpler_setup/scene_test.py
index 1ea89676c..b0a500d61 100644
--- a/simpler_setup/scene_test.py
+++ b/simpler_setup/scene_test.py
@@ -328,6 +328,7 @@ def _compile_chip_callable_from_spec(spec, platform, runtime, cache_key):
         func_name=orch["function_name"],
         binary=orch_binary,
         children=kernel_binaries,
+        config_name=orch.get("config_name", ""),
     )
     _compile_cache[cache_key] = chip_callable
     return chip_callable
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index ae5af8b4d..54bd70ce0 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -88,6 +88,8 @@ constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
 constexpr int32_t STALL_DUMP_CORE_MAX = 8;
 constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10;  // log every completion for the first N tasks
 constexpr int32_t PROGRESS_LOG_INTERVAL = 250;      // log every N completions after threshold
+constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry";
+constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config";
 
 static int32_t read_pto2_runtime_status(Runtime *runtime) {
     if (runtime == nullptr) {
@@ -2312,33 +2314,52 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             }
             DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
 
-            dlerror();
-            auto config_func =
-                reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, "aicpu_orchestration_config"));
+            const char *entry_symbol = runtime->get_device_orch_func_name();
+            if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
+                entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
+            }
+            const char *config_symbol = runtime->get_device_orch_config_name();
+            if (config_symbol == nullptr || config_symbol[0] == '\0') {
+                config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
+            }
 
             dlerror();
-            DeviceOrchestrationFunc orch_func =
-                reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, "aicpu_orchestration_entry"));
-            const char *dlsym_error = dlerror();
-            if (dlsym_error != nullptr) {
-                DEV_ERROR("Thread %d: dlsym failed: %s", thread_idx, dlsym_error);
+            DeviceOrchestrationFunc orch_func = reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
+            const char *entry_dlsym_error = dlerror();
+            if (entry_dlsym_error != nullptr) {
+                DEV_ERROR(
+                    "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error
+                );
                 dlclose(handle);
                 unlink(so_path);
                 return -1;
             }
             if (orch_func == nullptr) {
-                DEV_ERROR("Thread %d: dlsym returned NULL for aicpu_orchestration_entry", thread_idx);
+                DEV_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol);
                 dlclose(handle);
                 unlink(so_path);
                 return -1;
             }
 
+            dlerror();
+            auto config_func = reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, config_symbol));
+            const char *config_dlsym_error = dlerror();
+            if (config_dlsym_error != nullptr || config_func == nullptr) {
+                DEV_ERROR(
+                    "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol,
+                    config_dlsym_error ? config_dlsym_error : "NULL function pointer"
+                );
+                config_func = nullptr;
+            }
+
             dlerror();
             auto bind_runtime_func =
                 reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "pto2_framework_bind_runtime"));
             const char *bind_runtime_error = dlerror();
             if (bind_runtime_error != nullptr) {
-                DEV_INFO("Thread %d: Optional TLS runtime binder not found: %s", thread_idx, bind_runtime_error);
+                DEV_ERROR(
+                    "Thread %d: dlsym failed for pto2_framework_bind_runtime: %s", thread_idx, bind_runtime_error
+                );
                 bind_runtime_func = nullptr;
             }
 
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 81accc910..b2d3b25de 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -134,6 +134,8 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,
 
     const uint8_t *orch_so_binary = static_cast<const uint8_t *>(callable->binary_data());
     size_t orch_so_size = callable->binary_size();
+    runtime->set_device_orch_func_name(callable->func_name());
+    runtime->set_device_orch_config_name(callable->config_name());
 
     if (orch_so_binary == nullptr || orch_so_size == 0) {
         LOG_ERROR("Orchestration SO binary is required for device orchestration");
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
index 3c1a25499..ff0cbda6d 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
@@ -51,6 +51,8 @@ Runtime::Runtime() {
 
     // Initialize device orchestration SO binary
     device_orch_so_size_ = 0;
+    device_orch_func_name_[0] = '\0';
+    device_orch_config_name_[0] = '\0';
 
     // Initialize kernel binary tracking
     registered_kernel_count_ = 0;
@@ -119,6 +121,28 @@ const void *Runtime::get_device_orch_so_data() const {
 
 size_t Runtime::get_device_orch_so_size() const { return device_orch_so_size_; }
 
+void Runtime::set_device_orch_func_name(const char *name) {  // Copy the orchestration entry symbol name into the fixed-size member buffer.
+    if (name == nullptr) {  // nullptr is stored as an empty name.
+        device_orch_func_name_[0] = '\0';
+        return;
+    }
+    std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);  // NOTE: names longer than the buffer are silently truncated.
+    device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';  // strncpy adds no terminator when it truncates, so write one explicitly.
+}
+
+const char *Runtime::get_device_orch_func_name() const { return device_orch_func_name_; }  // Returns a pointer into this object's own buffer, not a copy.
+
+void Runtime::set_device_orch_config_name(const char *name) {  // Copy the orchestration config symbol name into the fixed-size member buffer.
+    if (name == nullptr) {  // nullptr is stored as an empty name.
+        device_orch_config_name_[0] = '\0';
+        return;
+    }
+    std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);  // NOTE: names longer than the buffer are silently truncated.
+    device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';  // strncpy adds no terminator when it truncates, so write one explicitly.
+}
+
+const char *Runtime::get_device_orch_config_name() const { return device_orch_config_name_; }  // Returns a pointer into this object's own buffer, not a copy.
+
 uint64_t Runtime::get_function_bin_addr(int func_id) const {
     if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
     return func_id_to_addr_[func_id];
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index f45be4b29..2d0e0b4b4 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -49,6 +49,7 @@
 #define RUNTIME_MAX_TENSOR_PAIRS 64
 #define RUNTIME_MAX_FUNC_ID 32
 #define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024)  // 4MB max for orchestration SO
+#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64
 
 // Default ready queue shards: one shard per worker thread (total minus orchestrator)
 constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1;
@@ -199,6 +200,8 @@ class Runtime {
     // Stored as a copy to avoid lifetime issues with Python ctypes arrays
     uint8_t device_orch_so_storage_[RUNTIME_MAX_ORCH_SO_SIZE];
     size_t device_orch_so_size_;
+    char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
+    char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
 
 public:
     /**
@@ -252,6 +255,10 @@ class Runtime {
     void set_device_orch_so(const void *data, size_t size);
     const void *get_device_orch_so_data() const;
     size_t get_device_orch_so_size() const;
+    void set_device_orch_func_name(const char *name);
+    const char *get_device_orch_func_name() const;
+    void set_device_orch_config_name(const char *name);
+    const char *get_device_orch_config_name() const;
 
     uint64_t get_function_bin_addr(int func_id) const;
     void set_function_bin_addr(int func_id, uint64_t addr);
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
index 37c2c81c2..a46dfbce1 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
@@ -88,6 +88,8 @@ constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
 constexpr int32_t STALL_DUMP_CORE_MAX = 8;
 constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10;  // log every completion for the first N tasks
 constexpr int32_t PROGRESS_LOG_INTERVAL = 250;      // log every N completions after threshold
+constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry";
+constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config";
 
 static int32_t read_pto2_runtime_status(Runtime *runtime) {
     if (runtime == nullptr) {
@@ -2287,33 +2289,52 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
             }
             DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);
 
-            dlerror();
-            auto config_func =
-                reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, "aicpu_orchestration_config"));
+            const char *entry_symbol = runtime->get_device_orch_func_name();
+            if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
+                entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
+            }
+            const char *config_symbol = runtime->get_device_orch_config_name();
+            if (config_symbol == nullptr || config_symbol[0] == '\0') {
+                config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
+            }
 
             dlerror();
-            DeviceOrchestrationFunc orch_func =
-                reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, "aicpu_orchestration_entry"));
-            const char *dlsym_error = dlerror();
-            if (dlsym_error != nullptr) {
-                DEV_ERROR("Thread %d: dlsym failed: %s", thread_idx, dlsym_error);
+            DeviceOrchestrationFunc orch_func = reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
+            const char *entry_dlsym_error = dlerror();
+            if (entry_dlsym_error != nullptr) {
+                DEV_ERROR(
+                    "Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error
+                );
                 dlclose(handle);
                 unlink(so_path);
                 return -1;
             }
             if (orch_func == nullptr) {
-                DEV_ERROR("Thread %d: dlsym returned NULL for aicpu_orchestration_entry", thread_idx);
+                DEV_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol);
                 dlclose(handle);
                 unlink(so_path);
                 return -1;
             }
 
+            dlerror();
+            auto config_func = reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, config_symbol));
+            const char *config_dlsym_error = dlerror();
+            if (config_dlsym_error != nullptr || config_func == nullptr) {
+                DEV_ERROR(
+                    "Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol,
+                    config_dlsym_error ? config_dlsym_error : "NULL function pointer"
+                );
+                config_func = nullptr;
+            }
+
             dlerror();
             auto bind_runtime_func =
                 reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "pto2_framework_bind_runtime"));
             const char *bind_runtime_error = dlerror();
             if (bind_runtime_error != nullptr) {
-                DEV_INFO("Thread %d: Optional TLS runtime binder not found: %s", thread_idx, bind_runtime_error);
+                DEV_ERROR(
+                    "Thread %d: dlsym failed for pto2_framework_bind_runtime: %s", thread_idx, bind_runtime_error
+                );
                 bind_runtime_func = nullptr;
             }
 
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 81accc910..b2d3b25de 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -134,6 +134,8 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,
 
     const uint8_t *orch_so_binary = static_cast<const uint8_t *>(callable->binary_data());
     size_t orch_so_size = callable->binary_size();
+    runtime->set_device_orch_func_name(callable->func_name());
+    runtime->set_device_orch_config_name(callable->config_name());
 
     if (orch_so_binary == nullptr || orch_so_size == 0) {
         LOG_ERROR("Orchestration SO binary is required for device orchestration");
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
index 3c1a25499..ff0cbda6d 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
@@ -51,6 +51,8 @@ Runtime::Runtime() {
 
     // Initialize device orchestration SO binary
     device_orch_so_size_ = 0;
+    device_orch_func_name_[0] = '\0';
+    device_orch_config_name_[0] = '\0';
 
     // Initialize kernel binary tracking
     registered_kernel_count_ = 0;
@@ -119,6 +121,28 @@ const void *Runtime::get_device_orch_so_data() const {
 
 size_t Runtime::get_device_orch_so_size() const { return device_orch_so_size_; }
 
+void Runtime::set_device_orch_func_name(const char *name) {  // Copy the orchestration entry symbol name into the fixed-size member buffer.
+    if (name == nullptr) {  // nullptr is stored as an empty name.
+        device_orch_func_name_[0] = '\0';
+        return;
+    }
+    std::strncpy(device_orch_func_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);  // NOTE: names longer than the buffer are silently truncated.
+    device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';  // strncpy adds no terminator when it truncates, so write one explicitly.
+}
+
+const char *Runtime::get_device_orch_func_name() const { return device_orch_func_name_; }  // Returns a pointer into this object's own buffer, not a copy.
+
+void Runtime::set_device_orch_config_name(const char *name) {  // Copy the orchestration config symbol name into the fixed-size member buffer.
+    if (name == nullptr) {  // nullptr is stored as an empty name.
+        device_orch_config_name_[0] = '\0';
+        return;
+    }
+    std::strncpy(device_orch_config_name_, name, RUNTIME_MAX_ORCH_SYMBOL_NAME - 1);  // NOTE: names longer than the buffer are silently truncated.
+    device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME - 1] = '\0';  // strncpy adds no terminator when it truncates, so write one explicitly.
+}
+
+const char *Runtime::get_device_orch_config_name() const { return device_orch_config_name_; }  // Returns a pointer into this object's own buffer, not a copy.
+
 uint64_t Runtime::get_function_bin_addr(int func_id) const {
     if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
     return func_id_to_addr_[func_id];
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
index d1a73f3f2..d57b80cde 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
@@ -49,6 +49,7 @@
 #define RUNTIME_MAX_TENSOR_PAIRS 64
 #define RUNTIME_MAX_FUNC_ID 32
 #define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024)  // 1MB max for orchestration SO
+#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64
 
 // Default ready queue shards: one shard per worker thread (total minus orchestrator)
 constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1;
@@ -199,6 +200,8 @@ class Runtime {
     // Stored as a copy to avoid lifetime issues with Python ctypes arrays
     uint8_t device_orch_so_storage_[RUNTIME_MAX_ORCH_SO_SIZE];
     size_t device_orch_so_size_;
+    char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
+    char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
 
 public:
     /**
@@ -252,6 +255,10 @@ class Runtime {
     void set_device_orch_so(const void *data, size_t size);
     const void *get_device_orch_so_data() const;
     size_t get_device_orch_so_size() const;
+    void set_device_orch_func_name(const char *name);
+    const char *get_device_orch_func_name() const;
+    void set_device_orch_config_name(const char *name);
+    const char *get_device_orch_config_name() const;
 
     uint64_t get_function_bin_addr(int func_id) const;
     void set_function_bin_addr(int func_id, uint64_t addr);
diff --git a/src/common/task_interface/callable.h b/src/common/task_interface/callable.h
index 9301d4204..50f7c3f29 100644
--- a/src/common/task_interface/callable.h
+++ b/src/common/task_interface/callable.h
@@ -105,6 +105,8 @@ struct Callable {
     int32_t child_func_ids_[MaxChildren];
     uint32_t child_offsets_[MaxChildren];
     int32_t child_count_;
+    char config_name_[CALLABLE_FUNC_NAME_MAX];
+    uint32_t config_name_len_;
     char storage_[];
 
     ArgDirection sig(int32_t i) const {
@@ -116,6 +118,8 @@ struct Callable {
     uint32_t binary_size() const { return binary_size_; }
     const char *func_name() const { return func_name_; }
     uint32_t func_name_len() const { return func_name_len_; }
+    const char *config_name() const { return config_name_; }
+    uint32_t config_name_len() const { return config_name_len_; }
 
     const Child &child(int32_t i) const {
         if (i < 0 || i >= child_count_) throw std::out_of_range("Callable: child index out of range");
@@ -137,7 +141,8 @@ struct Callable {
     template <typename C, int MS, int MC>
     friend std::vector<uint8_t> make_callable(
         const ArgDirection *sig, int32_t sig_count, const char *func_name, const void *binary, uint32_t binary_size,
-        const int32_t *child_func_ids, const std::vector<uint8_t> *child_buffers, int32_t child_count
+        const int32_t *child_func_ids, const std::vector<uint8_t> *child_buffers, int32_t child_count,
+        const char *config_name
     );
 };
 
@@ -180,7 +185,8 @@ make_callable(const ArgDirection *sig, int32_t sig_count, const void *binary, ui
 template <typename Child, int MaxSig, int MaxChildren>
 std::vector<uint8_t> make_callable(
     const ArgDirection *sig, int32_t sig_count, const char *func_name, const void *binary, uint32_t binary_size,
-    const int32_t *child_func_ids, const std::vector<uint8_t> *child_buffers, int32_t child_count
+    const int32_t *child_func_ids, const std::vector<uint8_t> *child_buffers, int32_t child_count,
+    const char *config_name = nullptr
 ) {
     if (sig_count > MaxSig) throw std::invalid_argument("make_callable: sig_count exceeds MaxSig");
     if (child_count > MaxChildren) throw std::invalid_argument("make_callable: child_count exceeds MaxChildren");
@@ -215,6 +221,17 @@ std::vector<uint8_t> make_callable(
         obj->func_name_len_ = 0;
     }
 
+    // Store config_name (null-terminated, truncated to CALLABLE_FUNC_NAME_MAX-1)
+    std::memset(obj->config_name_, 0, CALLABLE_FUNC_NAME_MAX);
+    if (config_name != nullptr) {
+        size_t name_len = std::strlen(config_name);
+        if (name_len >= CALLABLE_FUNC_NAME_MAX) name_len = CALLABLE_FUNC_NAME_MAX - 1;
+        std::memcpy(obj->config_name_, config_name, name_len);
+        obj->config_name_len_ = static_cast<uint32_t>(name_len);
+    } else {
+        obj->config_name_len_ = 0;
+    }
+
     if (binary_size > 0) std::memcpy(obj->storage_, binary, binary_size);
 
     for (int32_t i = 0; i < child_count; ++i) {
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
index 57b22dcff..b3314019a 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -61,7 +61,7 @@ aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
     };
 }
 
-__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
+__attribute__((visibility("default"))) void build_paged_attention_graph(const ChipStorageTaskArgs &orch_args) {
     uint64_t prof_param_extract = 0;
     uint64_t prof_ext_tensor = 0;
     uint64_t prof_scope = 0;
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
index 3e7a5935a..1460a588d 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
@@ -73,7 +73,7 @@ aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
     };
 }
 
-__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
+__attribute__((visibility("default"))) void build_paged_attention_graph(const ChipStorageTaskArgs &orch_args) {
 #ifdef ENABLE_PROFILING
     uint64_t prof_param_extract = 0;
     uint64_t prof_ext_tensor = 0;