diff --git a/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp index 31f07beae..7e468fce2 100644 --- a/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/aicpu_build_graph/aicpu/aicpu_executor.cpp @@ -29,6 +29,11 @@ #include "runtime.h" #include "spin_hint.h" +// memfd-based SO loading (Linux only) +#if defined(__linux__) +#include "memfd_loader.h" +#endif + // Runtime headers (full struct definition for create/destroy + PTO2_SCOPE) #include "pto_runtime2.h" #include "pto_runtime2_types.h" @@ -237,6 +242,7 @@ struct AicpuExecutor { // Orchestration SO handle - defer dlclose until all tasks complete void *orch_so_handle_{nullptr}; char orch_so_path_[256]{}; // Path to orchestration SO file for cleanup + int orch_so_memfd_{-1}; // memfd for memfd_create path (-1 if file-based) // Shared orchestration function pointer (loaded by first orch thread, used by all) DeviceOrchestrationFunc orch_func_{nullptr}; @@ -1596,50 +1602,71 @@ int32_t AicpuExecutor::run(Runtime *runtime) { return -1; } - // Try multiple paths that may allow execution on AICPU + // Try memfd first (Linux only), fall back to file-based char so_path[256]; - bool file_created = false; - const char *candidate_dirs[] = { - "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" - }; - const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); - - for (int32_t i = 0; i < num_candidates && !file_created; i++) { - snprintf(so_path, sizeof(so_path), "%s/libdevice_orch_%d.so", candidate_dirs[i], getpid()); - int32_t fd = open(so_path, O_WRONLY | O_CREAT | O_TRUNC, 0755); - if (fd < 0) { - DEV_INFO( - "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno - ); - continue; - } - ssize_t written = write(fd, so_data, so_size); - close(fd); - if (written != static_cast(so_size)) { - DEV_INFO( - "Thread %d: Cannot 
write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno - ); - unlink(so_path); - continue; - } - file_created = true; - DEV_INFO("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); - } + void *handle = nullptr; - if (!file_created) { - DEV_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); - return -1; +#if defined(__linux__) + int memfd = -1; + // Attempt memfd-based loading first + int memfd_rc = load_orchestration_so_with_memfd(so_data, so_size, thread_idx, &handle, so_path, &memfd); + + if (memfd_rc == 0 && handle != nullptr) { + // memfd loading succeeded, use memfd-loaded handle + orch_so_memfd_ = memfd; } +#endif - dlerror(); - void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); - const char *dlopen_err = dlerror(); if (handle == nullptr) { - DEV_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown"); - unlink(so_path); - return -1; + // memfd failed or unavailable - use file-based loading +#if defined(__linux__) + orch_so_memfd_ = -1; +#endif + + // Try multiple paths that may allow execution on AICPU + bool file_created = false; + const char *candidate_dirs[] = { + "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" + }; + const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); + + for (int32_t i = 0; i < num_candidates && !file_created; i++) { + snprintf(so_path, sizeof(so_path), "%s/libdevice_orch_%d.so", candidate_dirs[i], getpid()); + int32_t fd = open(so_path, O_WRONLY | O_CREAT | O_TRUNC, 0755); + if (fd < 0) { + DEV_INFO( + "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno + ); + continue; + } + ssize_t written = write(fd, so_data, so_size); + close(fd); + if (written != static_cast(so_size)) { + DEV_INFO( + "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno + ); + unlink(so_path); + 
continue; + } + file_created = true; + DEV_INFO("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); + } + + if (!file_created) { + DEV_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); + return -1; + } + + dlerror(); + handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); + const char *dlopen_err = dlerror(); + if (handle == nullptr) { + DEV_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? dlopen_err : "unknown"); + unlink(so_path); + return -1; + } + DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); } - DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); dlerror(); auto config_func = @@ -1970,8 +1997,21 @@ int32_t AicpuExecutor::run(Runtime *runtime) { // Destroy PTO2 runtime and close orchestration SO (moved from orchestrator path) if (!runtime->get_orch_built_on_host() && orch_so_handle_ != nullptr) { pto2_runtime_destroy(rt); + // Handle cleanup based on loading method +#if defined(__linux__) + if (orch_so_memfd_ >= 0) { + // memfd-based: close fd AFTER dlclose + cleanup_memfd_so(orch_so_memfd_, orch_so_handle_); + } else { + // File-based: dlclose handle and unlink file + dlclose(orch_so_handle_); + unlink(orch_so_path_); + } +#else + // Non-Linux: only file-based loading dlclose(orch_so_handle_); unlink(orch_so_path_); +#endif } DEV_ALWAYS("Thread %d: Last thread, marking executor finished", thread_idx); } @@ -2029,6 +2069,7 @@ void AicpuExecutor::deinit(Runtime *runtime) { orch_args_cached_ = nullptr; orch_so_handle_ = nullptr; orch_so_path_[0] = '\0'; + orch_so_memfd_ = -1; // Reset register-related state for (int32_t i = 0; i < MAX_CORES_PER_THREAD; i++) { diff --git a/src/a2a3/runtime/aicpu_build_graph/aicpu/memfd_loader.h b/src/a2a3/runtime/aicpu_build_graph/aicpu/memfd_loader.h new file mode 100644 index 000000000..d95d00e6e --- /dev/null +++ b/src/a2a3/runtime/aicpu_build_graph/aicpu/memfd_loader.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) PyPTO 
Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file memfd_loader.h
+ * @brief Memory file descriptor based SO loading for AICPU environment
+ */
+
+#ifndef SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_AICPU_MEMFD_LOADER_H_
+#define SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_AICPU_MEMFD_LOADER_H_
+
+// Enable GNU extensions for memfd_create and MFD_CLOEXEC
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+// Headers are included before the extern "C" block so that nothing they declare
+// is given C linkage by accident.
+#include <dlfcn.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "aicpu/device_log.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Load the orchestration SO from an anonymous in-memory file instead of a file on disk.
+ *
+ * Steps: create a memfd, copy the SO image into it, expose it under a regular-looking
+ * path through a temporary symlink to /proc/self/fd/<fd>, dlopen() that path, then
+ * remove the symlink again. On success the memfd stays open and is handed to the caller.
+ *
+ * @param so_data          SO image to load.
+ * @param so_size          Size of the image in bytes.
+ * @param orch_thread_num  Thread index; combined with the pid to name the symlink.
+ * @param out_handle       Receives the dlopen() handle, or nullptr on failure.
+ * @param out_so_path      Buffer of at least 256 bytes. Receives the (already removed)
+ *                         symlink path on success and an empty string on failure.
+ * @param out_memfd        Receives the memfd on success, or -1 on failure. Release it
+ *                         together with the handle through cleanup_memfd_so().
+ * @return 0 on success, -1 on failure (failures are logged with DEV_INFO).
+ */
+static inline int load_orchestration_so_with_memfd(
+    const void *so_data, size_t so_size, int orch_thread_num, void **out_handle, char *out_so_path, int *out_memfd
+) {
+    // Capacity the caller is expected to provide for out_so_path.
+    const size_t so_path_capacity = 256;
+
+    if (out_handle == nullptr || out_so_path == nullptr || out_memfd == nullptr) {
+        return -1;
+    }
+
+    *out_handle = nullptr;
+    *out_memfd = -1;
+    out_so_path[0] = '\0';
+
+    if (so_data == nullptr || so_size == 0) {
+        return -1;
+    }
+
+    // Create memfd
+    int fd = memfd_create("libdevice_orch", MFD_CLOEXEC);
+    if (fd < 0) {
+        DEV_INFO("memfd_create failed: errno=%d", errno);
+        return -1;
+    }
+
+    // Write SO data to memfd. write() may transfer fewer bytes than requested or be
+    // interrupted by a signal, so keep going until the whole image has been copied.
+    const char *cursor = static_cast<const char *>(so_data);
+    size_t remaining = so_size;
+    while (remaining > 0) {
+        ssize_t written = write(fd, cursor, remaining);
+        if (written < 0 && errno == EINTR) {
+            continue;
+        }
+        if (written < 0) {
+            DEV_INFO("memfd write failed: errno=%d", errno);
+            close(fd);
+            return -1;
+        }
+        if (written == 0) {
+            // No progress: report how far we got instead of looping forever.
+            DEV_INFO("memfd partial write: %zu/%zu", so_size - remaining, so_size);
+            close(fd);
+            return -1;
+        }
+        cursor += written;
+        remaining -= static_cast<size_t>(written);
+    }
+
+    // Reset file position to beginning before dlopen
+    if (lseek(fd, 0, SEEK_SET) < 0) {
+        DEV_INFO("memfd lseek failed: errno=%d", errno);
+        close(fd);
+        return -1;
+    }
+
+    // Construct /proc/self/fd/N path for symlink target
+    char proc_fd_path[256];
+    snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%d", fd);
+
+    // Create a symlink to /proc/self/fd/N with a "normal" path
+    // This bypasses the AICPU dynamic linker's issue with /proc/self/fd/N paths
+    char link_path[256];
+    snprintf(link_path, sizeof(link_path), "/tmp/libdevice_orch_%d_%d.so", getpid(), orch_thread_num);
+
+    int symlink_rc = symlink(proc_fd_path, link_path);
+    if (symlink_rc != 0 && errno == EEXIST) {
+        // A link with this name may be left over from an earlier process that had the
+        // same pid and exited before removing it. Drop the stale entry and retry once.
+        unlink(link_path);
+        symlink_rc = symlink(proc_fd_path, link_path);
+    }
+    if (symlink_rc != 0) {
+        DEV_INFO("symlink failed: errno=%d", errno);
+        close(fd);
+        return -1;
+    }
+
+    // Try dlopen from the symlink
+    dlerror();
+    void *handle = dlopen(link_path, RTLD_LAZY | RTLD_LOCAL);
+
+    // Clean up symlink immediately after dlopen (dlopen has its own reference)
+    unlink(link_path);
+
+    if (handle == nullptr) {
+        const char *dl_err = dlerror();
+        DEV_INFO("dlopen from memfd symlink failed: %s", dl_err ? dl_err : "unknown");
+        close(fd);
+        return -1;
+    }
+
+    // Report the path only on success so that a failed attempt leaves out_so_path empty.
+    snprintf(out_so_path, so_path_capacity, "%s", link_path);
+    *out_handle = handle;
+    *out_memfd = fd;
+    return 0;
+}
+
+/**
+ * Release an SO loaded by load_orchestration_so_with_memfd().
+ *
+ * The handle is passed to dlclose() first and the memfd is closed afterwards.
+ * A nullptr handle or a negative memfd is skipped.
+ *
+ * @param memfd   Descriptor returned through out_memfd, or a negative value.
+ * @param handle  Handle returned through out_handle, or nullptr.
+ */
+static inline void cleanup_memfd_so(int memfd, void *handle) {
+    if (handle != nullptr) {
+        dlclose(handle);
+    }
+    if (memfd >= 0) {
+        close(memfd);
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // SRC_A2A3_RUNTIME_AICPU_BUILD_GRAPH_AICPU_MEMFD_LOADER_H_
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 97afd6a4e..b53dff6d9 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -29,6 +29,11 @@ #include "runtime.h" #include "spin_hint.h" +// memfd-based SO loading (Linux only) +#if defined(__linux__) +#include "memfd_loader.h" +#endif + // Runtime headers (full struct definition for create/destroy + PTO2_SCOPE) #include "pto_runtime2.h" #include "pto_runtime2_types.h" @@ -340,6 +345,7 @@ struct AicpuExecutor { // Orchestration SO handle - defer dlclose until all tasks complete void *orch_so_handle_{nullptr}; char orch_so_path_[256]{}; // Path to orchestration SO file for cleanup + int orch_so_memfd_{-1}; // memfd for memfd_create path (-1 if file-based) // Shared orchestration function pointer (loaded by first orch thread, used by all) DeviceOrchestrationFunc orch_func_{nullptr}; @@ -1932,50 +1938,71 @@ int32_t AicpuExecutor::run(Runtime *runtime) { return -1; } - // Try multiple paths that may allow execution on AICPU + // Try memfd first (Linux only), fall back to file-based char so_path[256]; - bool file_created = false; - const char *candidate_dirs[] = { - "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" - }; - const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); - - for (int32_t i = 0; i < num_candidates && !file_created; i++) { - 
snprintf(so_path, sizeof(so_path), "%s/libdevice_orch_%d.so", candidate_dirs[i], getpid()); - int32_t fd = open(so_path, O_WRONLY | O_CREAT | O_TRUNC, 0755); - if (fd < 0) { - DEV_INFO( - "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno - ); - continue; - } - ssize_t written = write(fd, so_data, so_size); - close(fd); - if (written != static_cast(so_size)) { - DEV_INFO( - "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno - ); - unlink(so_path); - continue; - } - file_created = true; - DEV_INFO("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); - } + void *handle = nullptr; - if (!file_created) { - DEV_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); - return -1; +#if defined(__linux__) + int memfd = -1; + // Attempt memfd-based loading first + int memfd_rc = load_orchestration_so_with_memfd(so_data, so_size, thread_idx, &handle, so_path, &memfd); + + if (memfd_rc == 0 && handle != nullptr) { + // memfd loading succeeded, use memfd-loaded handle + orch_so_memfd_ = memfd; } +#endif - dlerror(); - void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); - const char *dlopen_err = dlerror(); if (handle == nullptr) { - DEV_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? 
dlopen_err : "unknown"); - unlink(so_path); - return -1; + // memfd failed or unavailable - use file-based loading +#if defined(__linux__) + orch_so_memfd_ = -1; +#endif + + // Try multiple paths that may allow execution on AICPU + bool file_created = false; + const char *candidate_dirs[] = { + "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" + }; + const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); + + for (int32_t i = 0; i < num_candidates && !file_created; i++) { + snprintf(so_path, sizeof(so_path), "%s/libdevice_orch_%d.so", candidate_dirs[i], getpid()); + int32_t fd = open(so_path, O_WRONLY | O_CREAT | O_TRUNC, 0755); + if (fd < 0) { + DEV_INFO( + "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno + ); + continue; + } + ssize_t written = write(fd, so_data, so_size); + close(fd); + if (written != static_cast(so_size)) { + DEV_INFO( + "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno + ); + unlink(so_path); + continue; + } + file_created = true; + DEV_INFO("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); + } + + if (!file_created) { + DEV_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); + return -1; + } + + dlerror(); + handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); + const char *dlopen_err = dlerror(); + if (handle == nullptr) { + DEV_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? 
dlopen_err : "unknown"); + unlink(so_path); + return -1; + } + DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); } - DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); dlerror(); auto config_func = @@ -2359,8 +2386,21 @@ int32_t AicpuExecutor::run(Runtime *runtime) { orch_bind_runtime_(nullptr); } pto2_runtime_destroy(rt); + // Handle cleanup based on loading method +#if defined(__linux__) + if (orch_so_memfd_ >= 0) { + // memfd-based: close fd AFTER dlclose + cleanup_memfd_so(orch_so_memfd_, orch_so_handle_); + } else { + // File-based: dlclose handle and unlink file + dlclose(orch_so_handle_); + unlink(orch_so_path_); + } +#else + // Non-Linux: only file-based loading dlclose(orch_so_handle_); unlink(orch_so_path_); +#endif } } @@ -2415,6 +2455,7 @@ void AicpuExecutor::deinit(Runtime *runtime) { orch_args_cached_ = nullptr; orch_so_handle_ = nullptr; orch_so_path_[0] = '\0'; + orch_so_memfd_ = -1; // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit) rt = nullptr; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/memfd_loader.h b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/memfd_loader.h new file mode 100644 index 000000000..e7249b35b --- /dev/null +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/memfd_loader.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file memfd_loader.h
+ * @brief Memory file descriptor based SO loading for AICPU environment
+ */
+
+#ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_AICPU_MEMFD_LOADER_H_
+#define SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_AICPU_MEMFD_LOADER_H_
+
+// Enable GNU extensions for memfd_create and MFD_CLOEXEC
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+// Headers are included before the extern "C" block so that nothing they declare
+// is given C linkage by accident.
+#include <dlfcn.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "aicpu/device_log.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Load the orchestration SO from an anonymous in-memory file instead of a file on disk.
+ *
+ * Steps: create a memfd, copy the SO image into it, expose it under a regular-looking
+ * path through a temporary symlink to /proc/self/fd/<fd>, dlopen() that path, then
+ * remove the symlink again. On success the memfd stays open and is handed to the caller.
+ *
+ * @param so_data          SO image to load.
+ * @param so_size          Size of the image in bytes.
+ * @param orch_thread_num  Thread index; combined with the pid to name the symlink.
+ * @param out_handle       Receives the dlopen() handle, or nullptr on failure.
+ * @param out_so_path      Buffer of at least 256 bytes. Receives the (already removed)
+ *                         symlink path on success and an empty string on failure.
+ * @param out_memfd        Receives the memfd on success, or -1 on failure. Release it
+ *                         together with the handle through cleanup_memfd_so().
+ * @return 0 on success, -1 on failure (failures are logged with DEV_INFO).
+ */
+static inline int load_orchestration_so_with_memfd(
+    const void *so_data, size_t so_size, int orch_thread_num, void **out_handle, char *out_so_path, int *out_memfd
+) {
+    // Capacity the caller is expected to provide for out_so_path.
+    const size_t so_path_capacity = 256;
+
+    if (out_handle == nullptr || out_so_path == nullptr || out_memfd == nullptr) {
+        return -1;
+    }
+
+    *out_handle = nullptr;
+    *out_memfd = -1;
+    out_so_path[0] = '\0';
+
+    if (so_data == nullptr || so_size == 0) {
+        return -1;
+    }
+
+    // Create memfd
+    int fd = memfd_create("libdevice_orch", MFD_CLOEXEC);
+    if (fd < 0) {
+        DEV_INFO("memfd_create failed: errno=%d", errno);
+        return -1;
+    }
+
+    // Write SO data to memfd. write() may transfer fewer bytes than requested or be
+    // interrupted by a signal, so keep going until the whole image has been copied.
+    const char *cursor = static_cast<const char *>(so_data);
+    size_t remaining = so_size;
+    while (remaining > 0) {
+        ssize_t written = write(fd, cursor, remaining);
+        if (written < 0 && errno == EINTR) {
+            continue;
+        }
+        if (written < 0) {
+            DEV_INFO("memfd write failed: errno=%d", errno);
+            close(fd);
+            return -1;
+        }
+        if (written == 0) {
+            // No progress: report how far we got instead of looping forever.
+            DEV_INFO("memfd partial write: %zu/%zu", so_size - remaining, so_size);
+            close(fd);
+            return -1;
+        }
+        cursor += written;
+        remaining -= static_cast<size_t>(written);
+    }
+
+    // Reset file position to beginning before dlopen
+    if (lseek(fd, 0, SEEK_SET) < 0) {
+        DEV_INFO("memfd lseek failed: errno=%d", errno);
+        close(fd);
+        return -1;
+    }
+
+    // Construct /proc/self/fd/N path for symlink target
+    char proc_fd_path[256];
+    snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%d", fd);
+
+    // Create a symlink to /proc/self/fd/N with a "normal" path
+    // This bypasses the AICPU dynamic linker's issue with /proc/self/fd/N paths
+    char link_path[256];
+    snprintf(link_path, sizeof(link_path), "/tmp/libdevice_orch_%d_%d.so", getpid(), orch_thread_num);
+
+    int symlink_rc = symlink(proc_fd_path, link_path);
+    if (symlink_rc != 0 && errno == EEXIST) {
+        // A link with this name may be left over from an earlier process that had the
+        // same pid and exited before removing it. Drop the stale entry and retry once.
+        unlink(link_path);
+        symlink_rc = symlink(proc_fd_path, link_path);
+    }
+    if (symlink_rc != 0) {
+        DEV_INFO("symlink failed: errno=%d", errno);
+        close(fd);
+        return -1;
+    }
+
+    // Try dlopen from the symlink
+    dlerror();
+    void *handle = dlopen(link_path, RTLD_LAZY | RTLD_LOCAL);
+
+    // Clean up symlink immediately after dlopen (dlopen has its own reference)
+    unlink(link_path);
+
+    if (handle == nullptr) {
+        const char *dl_err = dlerror();
+        DEV_INFO("dlopen from memfd symlink failed: %s", dl_err ? dl_err : "unknown");
+        close(fd);
+        return -1;
+    }
+
+    // Report the path only on success so that a failed attempt leaves out_so_path empty.
+    snprintf(out_so_path, so_path_capacity, "%s", link_path);
+    *out_handle = handle;
+    *out_memfd = fd;
+    return 0;
+}
+
+/**
+ * Release an SO loaded by load_orchestration_so_with_memfd().
+ *
+ * The handle is passed to dlclose() first and the memfd is closed afterwards.
+ * A nullptr handle or a negative memfd is skipped.
+ *
+ * @param memfd   Descriptor returned through out_memfd, or a negative value.
+ * @param handle  Handle returned through out_handle, or nullptr.
+ */
+static inline void cleanup_memfd_so(int memfd, void *handle) {
+    if (handle != nullptr) {
+        dlclose(handle);
+    }
+    if (memfd >= 0) {
+        close(memfd);
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_AICPU_MEMFD_LOADER_H_
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index dcf3d5658..86efe2399 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -29,6 +29,11 @@ #include "runtime.h" #include "spin_hint.h" +// memfd-based SO loading (Linux only) +#if defined(__linux__) +#include "memfd_loader.h" +#endif + // Runtime headers (full struct definition for create/destroy + PTO2_SCOPE) #include "pto_runtime2.h" #include "pto_runtime2_types.h" @@ -338,6 +343,7 @@ struct AicpuExecutor { // Orchestration SO handle - defer dlclose until all tasks complete void *orch_so_handle_{nullptr}; char orch_so_path_[256]{}; // Path to orchestration SO file for cleanup + int orch_so_memfd_{-1}; // memfd for memfd_create path (-1 if file-based) // Shared orchestration function pointer (loaded by first orch thread, used by all) DeviceOrchestrationFunc orch_func_{nullptr}; @@ -1910,50 +1916,71 @@ 
int32_t AicpuExecutor::run(Runtime *runtime) { return -1; } - // Try multiple paths that may allow execution on AICPU + // Try memfd first (Linux only), fall back to file-based char so_path[256]; - bool file_created = false; - const char *candidate_dirs[] = { - "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" - }; - const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); - - for (int32_t i = 0; i < num_candidates && !file_created; i++) { - snprintf(so_path, sizeof(so_path), "%s/libdevice_orch_%d.so", candidate_dirs[i], getpid()); - int32_t fd = open(so_path, O_WRONLY | O_CREAT | O_TRUNC, 0755); - if (fd < 0) { - DEV_INFO( - "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno - ); - continue; - } - ssize_t written = write(fd, so_data, so_size); - close(fd); - if (written != static_cast(so_size)) { - DEV_INFO( - "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno - ); - unlink(so_path); - continue; - } - file_created = true; - DEV_INFO("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); - } + void *handle = nullptr; - if (!file_created) { - DEV_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); - return -1; +#if defined(__linux__) + int memfd = -1; + // Attempt memfd-based loading first + int memfd_rc = load_orchestration_so_with_memfd(so_data, so_size, thread_idx, &handle, so_path, &memfd); + + if (memfd_rc == 0 && handle != nullptr) { + // memfd loading succeeded, use memfd-loaded handle + orch_so_memfd_ = memfd; } +#endif - dlerror(); - void *handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); - const char *dlopen_err = dlerror(); if (handle == nullptr) { - DEV_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? 
dlopen_err : "unknown"); - unlink(so_path); - return -1; + // memfd failed or unavailable - use file-based loading +#if defined(__linux__) + orch_so_memfd_ = -1; +#endif + + // Try multiple paths that may allow execution on AICPU + bool file_created = false; + const char *candidate_dirs[] = { + "/usr/lib64/aicpu_kernels/0/aicpu_kernels_device", "/usr/lib64", "/lib64", "/var/tmp", "/tmp" + }; + const int32_t num_candidates = sizeof(candidate_dirs) / sizeof(candidate_dirs[0]); + + for (int32_t i = 0; i < num_candidates && !file_created; i++) { + snprintf(so_path, sizeof(so_path), "%s/libdevice_orch_%d.so", candidate_dirs[i], getpid()); + int32_t fd = open(so_path, O_WRONLY | O_CREAT | O_TRUNC, 0755); + if (fd < 0) { + DEV_INFO( + "Thread %d: Cannot create SO at %s (errno=%d), trying next path", thread_idx, so_path, errno + ); + continue; + } + ssize_t written = write(fd, so_data, so_size); + close(fd); + if (written != static_cast(so_size)) { + DEV_INFO( + "Thread %d: Cannot write SO to %s (errno=%d), trying next path", thread_idx, so_path, errno + ); + unlink(so_path); + continue; + } + file_created = true; + DEV_INFO("Thread %d: Created SO file at %s (%zu bytes)", thread_idx, so_path, so_size); + } + + if (!file_created) { + DEV_ERROR("Thread %d: Failed to create SO file in any candidate path", thread_idx); + return -1; + } + + dlerror(); + handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); + const char *dlopen_err = dlerror(); + if (handle == nullptr) { + DEV_ERROR("Thread %d: dlopen failed: %s", thread_idx, dlopen_err ? 
dlopen_err : "unknown"); + unlink(so_path); + return -1; + } + DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); } - DEV_INFO("Thread %d: dlopen succeeded, handle=%p", thread_idx, handle); dlerror(); auto config_func = @@ -2336,8 +2363,21 @@ int32_t AicpuExecutor::run(Runtime *runtime) { orch_bind_runtime_(nullptr); } pto2_runtime_destroy(rt); + // Handle cleanup based on loading method +#if defined(__linux__) + if (orch_so_memfd_ >= 0) { + // memfd-based: close fd AFTER dlclose + cleanup_memfd_so(orch_so_memfd_, orch_so_handle_); + } else { + // File-based: dlclose handle and unlink file + dlclose(orch_so_handle_); + unlink(orch_so_path_); + } +#else + // Non-Linux: only file-based loading dlclose(orch_so_handle_); unlink(orch_so_path_); +#endif } } @@ -2391,6 +2431,7 @@ void AicpuExecutor::deinit(Runtime *runtime) { orch_args_cached_ = nullptr; orch_so_handle_ = nullptr; orch_so_path_[0] = '\0'; + orch_so_memfd_ = -1; // Clear file-scope PTO2Runtime pointer (freed by orchestrator thread before deinit) rt = nullptr; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/memfd_loader.h b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/memfd_loader.h new file mode 100644 index 000000000..6acabc1f9 --- /dev/null +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/memfd_loader.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+/**
+ * @file memfd_loader.h
+ * @brief Memory file descriptor based SO loading for AICPU environment
+ */
+
+#ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_AICPU_MEMFD_LOADER_H_
+#define SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_AICPU_MEMFD_LOADER_H_
+
+// Enable GNU extensions for memfd_create and MFD_CLOEXEC
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+// Headers are included before the extern "C" block so that nothing they declare
+// is given C linkage by accident.
+#include <dlfcn.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "aicpu/device_log.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Load the orchestration SO from an anonymous in-memory file instead of a file on disk.
+ *
+ * Steps: create a memfd, copy the SO image into it, expose it under a regular-looking
+ * path through a temporary symlink to /proc/self/fd/<fd>, dlopen() that path, then
+ * remove the symlink again. On success the memfd stays open and is handed to the caller.
+ *
+ * @param so_data          SO image to load.
+ * @param so_size          Size of the image in bytes.
+ * @param orch_thread_num  Thread index; combined with the pid to name the symlink.
+ * @param out_handle       Receives the dlopen() handle, or nullptr on failure.
+ * @param out_so_path      Buffer of at least 256 bytes. Receives the (already removed)
+ *                         symlink path on success and an empty string on failure.
+ * @param out_memfd        Receives the memfd on success, or -1 on failure. Release it
+ *                         together with the handle through cleanup_memfd_so().
+ * @return 0 on success, -1 on failure (failures are logged with DEV_INFO).
+ */
+static inline int load_orchestration_so_with_memfd(
+    const void *so_data, size_t so_size, int orch_thread_num, void **out_handle, char *out_so_path, int *out_memfd
+) {
+    // Capacity the caller is expected to provide for out_so_path.
+    const size_t so_path_capacity = 256;
+
+    if (out_handle == nullptr || out_so_path == nullptr || out_memfd == nullptr) {
+        return -1;
+    }
+
+    *out_handle = nullptr;
+    *out_memfd = -1;
+    out_so_path[0] = '\0';
+
+    if (so_data == nullptr || so_size == 0) {
+        return -1;
+    }
+
+    // Create memfd
+    int fd = memfd_create("libdevice_orch", MFD_CLOEXEC);
+    if (fd < 0) {
+        DEV_INFO("memfd_create failed: errno=%d", errno);
+        return -1;
+    }
+
+    // Write SO data to memfd. write() may transfer fewer bytes than requested or be
+    // interrupted by a signal, so keep going until the whole image has been copied.
+    const char *cursor = static_cast<const char *>(so_data);
+    size_t remaining = so_size;
+    while (remaining > 0) {
+        ssize_t written = write(fd, cursor, remaining);
+        if (written < 0 && errno == EINTR) {
+            continue;
+        }
+        if (written < 0) {
+            DEV_INFO("memfd write failed: errno=%d", errno);
+            close(fd);
+            return -1;
+        }
+        if (written == 0) {
+            // No progress: report how far we got instead of looping forever.
+            DEV_INFO("memfd partial write: %zu/%zu", so_size - remaining, so_size);
+            close(fd);
+            return -1;
+        }
+        cursor += written;
+        remaining -= static_cast<size_t>(written);
+    }
+
+    // Reset file position to beginning before dlopen
+    if (lseek(fd, 0, SEEK_SET) < 0) {
+        DEV_INFO("memfd lseek failed: errno=%d", errno);
+        close(fd);
+        return -1;
+    }
+
+    // Construct /proc/self/fd/N path for symlink target
+    char proc_fd_path[256];
+    snprintf(proc_fd_path, sizeof(proc_fd_path), "/proc/self/fd/%d", fd);
+
+    // Create a symlink to /proc/self/fd/N with a "normal" path
+    // This bypasses the AICPU dynamic linker's issue with /proc/self/fd/N paths
+    char link_path[256];
+    snprintf(link_path, sizeof(link_path), "/tmp/libdevice_orch_%d_%d.so", getpid(), orch_thread_num);
+
+    int symlink_rc = symlink(proc_fd_path, link_path);
+    if (symlink_rc != 0 && errno == EEXIST) {
+        // A link with this name may be left over from an earlier process that had the
+        // same pid and exited before removing it. Drop the stale entry and retry once.
+        unlink(link_path);
+        symlink_rc = symlink(proc_fd_path, link_path);
+    }
+    if (symlink_rc != 0) {
+        DEV_INFO("symlink failed: errno=%d", errno);
+        close(fd);
+        return -1;
+    }
+
+    // Try dlopen from the symlink
+    dlerror();
+    void *handle = dlopen(link_path, RTLD_LAZY | RTLD_LOCAL);
+
+    // Clean up symlink immediately after dlopen (dlopen has its own reference)
+    unlink(link_path);
+
+    if (handle == nullptr) {
+        const char *dl_err = dlerror();
+        DEV_INFO("dlopen from memfd symlink failed: %s", dl_err ? dl_err : "unknown");
+        close(fd);
+        return -1;
+    }
+
+    // Report the path only on success so that a failed attempt leaves out_so_path empty.
+    snprintf(out_so_path, so_path_capacity, "%s", link_path);
+    *out_handle = handle;
+    *out_memfd = fd;
+    return 0;
+}
+
+/**
+ * Release an SO loaded by load_orchestration_so_with_memfd().
+ *
+ * The handle is passed to dlclose() first and the memfd is closed afterwards.
+ * A nullptr handle or a negative memfd is skipped.
+ *
+ * @param memfd   Descriptor returned through out_memfd, or a negative value.
+ * @param handle  Handle returned through out_handle, or nullptr.
+ */
+static inline void cleanup_memfd_so(int memfd, void *handle) {
+    if (handle != nullptr) {
+        dlclose(handle);
+    }
+    if (memfd >= 0) {
+        close(memfd);
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_AICPU_MEMFD_LOADER_H_