Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ def _compile_kernel(kernel):
func_name=orchestration["function_name"],
binary=orch_binary,
children=kernel_binaries,
config_name=orchestration.get("config_name", ""),
)

all_cases = getattr(golden, "ALL_CASES", {"Default": {}})
Expand Down
20 changes: 15 additions & 5 deletions python/bindings/task_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ NB_MODULE(_task_interface, m) {
.def_static(
"build",
[](std::vector<ArgDirection> signature, std::string func_name, nb::bytes binary,
std::vector<std::tuple<int32_t, PyCoreCallable>> children) -> PyChipCallable {
std::vector<std::tuple<int32_t, PyCoreCallable>> children, std::string config_name) -> PyChipCallable {
auto bin_ptr = reinterpret_cast<const void *>(binary.c_str());
auto bin_size = static_cast<uint32_t>(binary.size());
auto child_count = static_cast<int32_t>(children.size());
Expand All @@ -462,11 +462,12 @@ NB_MODULE(_task_interface, m) {

auto buf = make_callable<CoreCallable, CHIP_MAX_TENSOR_ARGS, 32>(
signature.data(), static_cast<int32_t>(signature.size()), func_name.c_str(), bin_ptr, bin_size,
func_ids.data(), child_bufs.data(), child_count
func_ids.data(), child_bufs.data(), child_count, config_name.c_str()
);
return PyChipCallable{std::move(buf)};
},
nb::arg("signature"), nb::arg("func_name"), nb::arg("binary"), nb::arg("children"),
nb::arg("config_name") = "",
"Build a ChipCallable from signature, func_name, binary, and list of (func_id, CoreCallable) children."
)

Expand Down Expand Up @@ -503,6 +504,15 @@ NB_MODULE(_task_interface, m) {
"The orchestration function name."
)

// Read-only Python property "config_name" on ChipCallable.
.def_prop_ro(
"config_name",
[](const PyChipCallable &self) -> std::string {
const auto &c = self.get();
// Copy exactly config_name_len() bytes starting at config_name() into a new std::string.
return std::string(c.config_name(), c.config_name_len());
},
"The optional orchestration config function name."
)

.def_prop_ro(
"child_count",
[](const PyChipCallable &self) -> int32_t {
Expand Down Expand Up @@ -568,9 +578,9 @@ NB_MODULE(_task_interface, m) {
.def("__repr__", [](const PyChipCallable &self) -> std::string {
const auto &c = self.get();
std::ostringstream os;
os << "ChipCallable(func_name=\"" << std::string(c.func_name(), c.func_name_len())
<< "\", sig_count=" << c.sig_count() << ", binary_size=" << c.binary_size()
<< ", child_count=" << c.child_count() << ")";
os << "ChipCallable(func_name=\"" << std::string(c.func_name(), c.func_name_len()) << "\", config_name=\""
<< std::string(c.config_name(), c.config_name_len()) << "\", sig_count=" << c.sig_count()
<< ", binary_size=" << c.binary_size() << ", child_count=" << c.child_count() << ")";
return os.str();
});

Expand Down
1 change: 1 addition & 0 deletions simpler_setup/code_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,6 +819,7 @@ def _compile_one_kernel(kernel):
func_name=self.orchestration["function_name"],
binary=orch_so_binary,
children=kernel_binaries,
config_name=self.orchestration.get("config_name", ""),
)

# Step 2: Create ChipWorker
Expand Down
1 change: 1 addition & 0 deletions simpler_setup/scene_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@ def _compile_chip_callable_from_spec(spec, platform, runtime, cache_key):
func_name=orch["function_name"],
binary=orch_binary,
children=kernel_binaries,
config_name=orch.get("config_name", ""),
)
_compile_cache[cache_key] = chip_callable
return chip_callable
Expand Down
41 changes: 31 additions & 10 deletions src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
constexpr int32_t STALL_DUMP_CORE_MAX = 8;
constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10; // log every completion for the first N tasks
constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions after threshold
// Fallback symbol names resolved via dlsym() in the orchestration SO when the
// Runtime reports a null or empty entry/config symbol name.
constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry";
constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config";

static int32_t read_pto2_runtime_status(Runtime *runtime) {
if (runtime == nullptr) {
Expand Down Expand Up @@ -2312,33 +2314,52 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
}
DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);

dlerror();
auto config_func =
reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, "aicpu_orchestration_config"));
const char *entry_symbol = runtime->get_device_orch_func_name();
if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
}
const char *config_symbol = runtime->get_device_orch_config_name();
if (config_symbol == nullptr || config_symbol[0] == '\0') {
config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
}

dlerror();
DeviceOrchestrationFunc orch_func =
reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, "aicpu_orchestration_entry"));
const char *dlsym_error = dlerror();
if (dlsym_error != nullptr) {
DEV_ERROR("Thread %d: dlsym failed: %s", thread_idx, dlsym_error);
DeviceOrchestrationFunc orch_func = reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
const char *entry_dlsym_error = dlerror();
if (entry_dlsym_error != nullptr) {
DEV_ERROR(
"Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error
);
dlclose(handle);
unlink(so_path);
return -1;
}
if (orch_func == nullptr) {
DEV_ERROR("Thread %d: dlsym returned NULL for aicpu_orchestration_entry", thread_idx);
DEV_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol);
dlclose(handle);
unlink(so_path);
return -1;
}

dlerror();
auto config_func = reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, config_symbol));
const char *config_dlsym_error = dlerror();
if (config_dlsym_error != nullptr || config_func == nullptr) {
DEV_ERROR(
"Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol,
config_dlsym_error ? config_dlsym_error : "NULL function pointer"
);
config_func = nullptr;
}

dlerror();
auto bind_runtime_func =
reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "pto2_framework_bind_runtime"));
const char *bind_runtime_error = dlerror();
if (bind_runtime_error != nullptr) {
DEV_INFO("Thread %d: Optional TLS runtime binder not found: %s", thread_idx, bind_runtime_error);
DEV_ERROR(
"Thread %d: dlsym failed for pto2_framework_bind_runtime: %s", thread_idx, bind_runtime_error
);
bind_runtime_func = nullptr;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,

const uint8_t *orch_so_binary = static_cast<const uint8_t *>(callable->binary_data());
size_t orch_so_size = callable->binary_size();
runtime->set_device_orch_func_name(callable->func_name());
runtime->set_device_orch_config_name(callable->config_name());

if (orch_so_binary == nullptr || orch_so_size == 0) {
LOG_ERROR("Orchestration SO binary is required for device orchestration");
Expand Down
24 changes: 24 additions & 0 deletions src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ Runtime::Runtime() {

// Initialize device orchestration SO binary
device_orch_so_size_ = 0;
device_orch_func_name_[0] = '\0';
device_orch_config_name_[0] = '\0';

// Initialize kernel binary tracking
registered_kernel_count_ = 0;
Expand Down Expand Up @@ -119,6 +121,28 @@ const void *Runtime::get_device_orch_so_data() const {

size_t Runtime::get_device_orch_so_size() const { return device_orch_so_size_; }

// Store the orchestration entry symbol name in the fixed-size member buffer.
//   name: C string to copy; nullptr resets the stored name to the empty string.
// Names longer than RUNTIME_MAX_ORCH_SYMBOL_NAME - 1 characters are truncated;
// the buffer is always left NUL-terminated.
void Runtime::set_device_orch_func_name(const char *name) {
    const int last_index = RUNTIME_MAX_ORCH_SYMBOL_NAME - 1;
    if (name != nullptr) {
        // strncpy does not terminate on truncation, so the final byte is forced to NUL.
        std::strncpy(device_orch_func_name_, name, last_index);
        device_orch_func_name_[last_index] = '\0';
    } else {
        device_orch_func_name_[0] = '\0';
    }
}

// Returns a pointer to the internally stored orchestration entry symbol name.
// The pointer refers to this Runtime's own buffer.
const char *Runtime::get_device_orch_func_name() const {
    return device_orch_func_name_;
}

// Store the orchestration config symbol name in the fixed-size member buffer.
//   name: C string to copy; nullptr resets the stored name to the empty string.
// Names longer than RUNTIME_MAX_ORCH_SYMBOL_NAME - 1 characters are truncated;
// the buffer is always left NUL-terminated.
void Runtime::set_device_orch_config_name(const char *name) {
    const int last_index = RUNTIME_MAX_ORCH_SYMBOL_NAME - 1;
    if (name != nullptr) {
        // strncpy does not terminate on truncation, so the final byte is forced to NUL.
        std::strncpy(device_orch_config_name_, name, last_index);
        device_orch_config_name_[last_index] = '\0';
    } else {
        device_orch_config_name_[0] = '\0';
    }
}

// Returns a pointer to the internally stored orchestration config symbol name.
// The pointer refers to this Runtime's own buffer.
const char *Runtime::get_device_orch_config_name() const {
    return device_orch_config_name_;
}

uint64_t Runtime::get_function_bin_addr(int func_id) const {
if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
return func_id_to_addr_[func_id];
Expand Down
7 changes: 7 additions & 0 deletions src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#define RUNTIME_MAX_TENSOR_PAIRS 64
#define RUNTIME_MAX_FUNC_ID 32
#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4MB max for orchestration SO
// Size in bytes (including the terminating NUL) of the buffers holding orchestration symbol names
#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64

// Default ready queue shards: one shard per worker thread (total minus orchestrator)
constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1;
Expand Down Expand Up @@ -199,6 +200,8 @@ class Runtime {
// Stored as a copy to avoid lifetime issues with Python ctypes arrays
uint8_t device_orch_so_storage_[RUNTIME_MAX_ORCH_SO_SIZE];
size_t device_orch_so_size_;
char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];

public:
/**
Expand Down Expand Up @@ -252,6 +255,10 @@ class Runtime {
void set_device_orch_so(const void *data, size_t size);
const void *get_device_orch_so_data() const;
size_t get_device_orch_so_size() const;
// Orchestration entry/config symbol names.
// Setters copy `name` into a fixed RUNTIME_MAX_ORCH_SYMBOL_NAME-byte buffer
// (longer names are truncated; nullptr clears the stored name).
// Getters return a pointer to that internal buffer.
void set_device_orch_func_name(const char *name);
const char *get_device_orch_func_name() const;
void set_device_orch_config_name(const char *name);
const char *get_device_orch_config_name() const;

uint64_t get_function_bin_addr(int func_id) const;
void set_function_bin_addr(int func_id, uint64_t addr);
Expand Down
41 changes: 31 additions & 10 deletions src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ constexpr int32_t STALL_DUMP_WAIT_MAX = 4;
constexpr int32_t STALL_DUMP_CORE_MAX = 8;
constexpr int32_t PROGRESS_VERBOSE_THRESHOLD = 10; // log every completion for the first N tasks
constexpr int32_t PROGRESS_LOG_INTERVAL = 250; // log every N completions after threshold
// Fallback symbol names resolved via dlsym() in the orchestration SO when the
// Runtime reports a null or empty entry/config symbol name.
constexpr const char *DEFAULT_ORCH_ENTRY_SYMBOL = "aicpu_orchestration_entry";
constexpr const char *DEFAULT_ORCH_CONFIG_SYMBOL = "aicpu_orchestration_config";

static int32_t read_pto2_runtime_status(Runtime *runtime) {
if (runtime == nullptr) {
Expand Down Expand Up @@ -2287,33 +2289,52 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
}
DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle);

dlerror();
auto config_func =
reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, "aicpu_orchestration_config"));
const char *entry_symbol = runtime->get_device_orch_func_name();
if (entry_symbol == nullptr || entry_symbol[0] == '\0') {
entry_symbol = DEFAULT_ORCH_ENTRY_SYMBOL;
}
const char *config_symbol = runtime->get_device_orch_config_name();
if (config_symbol == nullptr || config_symbol[0] == '\0') {
config_symbol = DEFAULT_ORCH_CONFIG_SYMBOL;
}

dlerror();
DeviceOrchestrationFunc orch_func =
reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, "aicpu_orchestration_entry"));
const char *dlsym_error = dlerror();
if (dlsym_error != nullptr) {
DEV_ERROR("Thread %d: dlsym failed: %s", thread_idx, dlsym_error);
DeviceOrchestrationFunc orch_func = reinterpret_cast<DeviceOrchestrationFunc>(dlsym(handle, entry_symbol));
const char *entry_dlsym_error = dlerror();
if (entry_dlsym_error != nullptr) {
DEV_ERROR(
"Thread %d: dlsym failed for entry symbol '%s': %s", thread_idx, entry_symbol, entry_dlsym_error
);
dlclose(handle);
unlink(so_path);
return -1;
}
if (orch_func == nullptr) {
DEV_ERROR("Thread %d: dlsym returned NULL for aicpu_orchestration_entry", thread_idx);
DEV_ERROR("Thread %d: dlsym returned NULL for entry symbol '%s'", thread_idx, entry_symbol);
dlclose(handle);
unlink(so_path);
return -1;
}

dlerror();
auto config_func = reinterpret_cast<DeviceOrchestrationConfigFunc>(dlsym(handle, config_symbol));
const char *config_dlsym_error = dlerror();
if (config_dlsym_error != nullptr || config_func == nullptr) {
DEV_ERROR(
"Thread %d: dlsym failed for config symbol '%s': %s", thread_idx, config_symbol,
config_dlsym_error ? config_dlsym_error : "NULL function pointer"
);
config_func = nullptr;
}

dlerror();
auto bind_runtime_func =
reinterpret_cast<DeviceOrchestrationBindRuntimeFunc>(dlsym(handle, "pto2_framework_bind_runtime"));
const char *bind_runtime_error = dlerror();
if (bind_runtime_error != nullptr) {
DEV_INFO("Thread %d: Optional TLS runtime binder not found: %s", thread_idx, bind_runtime_error);
DEV_ERROR(
"Thread %d: dlsym failed for pto2_framework_bind_runtime: %s", thread_idx, bind_runtime_error
);
bind_runtime_func = nullptr;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ extern "C" int init_runtime_impl(Runtime *runtime, const ChipCallable *callable,

const uint8_t *orch_so_binary = static_cast<const uint8_t *>(callable->binary_data());
size_t orch_so_size = callable->binary_size();
runtime->set_device_orch_func_name(callable->func_name());
runtime->set_device_orch_config_name(callable->config_name());

if (orch_so_binary == nullptr || orch_so_size == 0) {
LOG_ERROR("Orchestration SO binary is required for device orchestration");
Expand Down
24 changes: 24 additions & 0 deletions src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ Runtime::Runtime() {

// Initialize device orchestration SO binary
device_orch_so_size_ = 0;
device_orch_func_name_[0] = '\0';
device_orch_config_name_[0] = '\0';

// Initialize kernel binary tracking
registered_kernel_count_ = 0;
Expand Down Expand Up @@ -119,6 +121,28 @@ const void *Runtime::get_device_orch_so_data() const {

size_t Runtime::get_device_orch_so_size() const { return device_orch_so_size_; }

// Store the orchestration entry symbol name in the fixed-size member buffer.
//   name: C string to copy; nullptr resets the stored name to the empty string.
// Names longer than RUNTIME_MAX_ORCH_SYMBOL_NAME - 1 characters are truncated;
// the buffer is always left NUL-terminated.
void Runtime::set_device_orch_func_name(const char *name) {
    const int last_index = RUNTIME_MAX_ORCH_SYMBOL_NAME - 1;
    if (name != nullptr) {
        // strncpy does not terminate on truncation, so the final byte is forced to NUL.
        std::strncpy(device_orch_func_name_, name, last_index);
        device_orch_func_name_[last_index] = '\0';
    } else {
        device_orch_func_name_[0] = '\0';
    }
}

// Returns a pointer to the internally stored orchestration entry symbol name.
// The pointer refers to this Runtime's own buffer.
const char *Runtime::get_device_orch_func_name() const {
    return device_orch_func_name_;
}

// Store the orchestration config symbol name in the fixed-size member buffer.
//   name: C string to copy; nullptr resets the stored name to the empty string.
// Names longer than RUNTIME_MAX_ORCH_SYMBOL_NAME - 1 characters are truncated;
// the buffer is always left NUL-terminated.
void Runtime::set_device_orch_config_name(const char *name) {
    const int last_index = RUNTIME_MAX_ORCH_SYMBOL_NAME - 1;
    if (name != nullptr) {
        // strncpy does not terminate on truncation, so the final byte is forced to NUL.
        std::strncpy(device_orch_config_name_, name, last_index);
        device_orch_config_name_[last_index] = '\0';
    } else {
        device_orch_config_name_[0] = '\0';
    }
}

// Returns a pointer to the internally stored orchestration config symbol name.
// The pointer refers to this Runtime's own buffer.
const char *Runtime::get_device_orch_config_name() const {
    return device_orch_config_name_;
}

uint64_t Runtime::get_function_bin_addr(int func_id) const {
if (func_id < 0 || func_id >= RUNTIME_MAX_FUNC_ID) return 0;
return func_id_to_addr_[func_id];
Expand Down
7 changes: 7 additions & 0 deletions src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#define RUNTIME_MAX_TENSOR_PAIRS 64
#define RUNTIME_MAX_FUNC_ID 32
#define RUNTIME_MAX_ORCH_SO_SIZE (4 * 1024 * 1024) // 4MB max for orchestration SO
// Size in bytes (including the terminating NUL) of the buffers holding orchestration symbol names
#define RUNTIME_MAX_ORCH_SYMBOL_NAME 64

// Default ready queue shards: one shard per worker thread (total minus orchestrator)
constexpr int RUNTIME_DEFAULT_READY_QUEUE_SHARDS = PLATFORM_MAX_AICPU_THREADS - 1;
Expand Down Expand Up @@ -199,6 +200,8 @@ class Runtime {
// Stored as a copy to avoid lifetime issues with Python ctypes arrays
uint8_t device_orch_so_storage_[RUNTIME_MAX_ORCH_SO_SIZE];
size_t device_orch_so_size_;
char device_orch_func_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];
char device_orch_config_name_[RUNTIME_MAX_ORCH_SYMBOL_NAME];

public:
/**
Expand Down Expand Up @@ -252,6 +255,10 @@ class Runtime {
void set_device_orch_so(const void *data, size_t size);
const void *get_device_orch_so_data() const;
size_t get_device_orch_so_size() const;
// Orchestration entry/config symbol names.
// Setters copy `name` into a fixed RUNTIME_MAX_ORCH_SYMBOL_NAME-byte buffer
// (longer names are truncated; nullptr clears the stored name).
// Getters return a pointer to that internal buffer.
void set_device_orch_func_name(const char *name);
const char *get_device_orch_func_name() const;
void set_device_orch_config_name(const char *name);
const char *get_device_orch_config_name() const;

uint64_t get_function_bin_addr(int func_id) const;
void set_function_bin_addr(int func_id, uint64_t addr);
Expand Down
Loading
Loading