pytorch · Gasoonjia · Jan 13, 2026 · Jan 13, 2026 · Jan 13, 2026 · Jan 14, 2026
@@ -25,37 +25,57 @@ endif()
 include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
 find_package_torch()
 
-# Common AOTI functionality - combines all AOTI common components
-set(_aoti_common_sources common_shims.cpp)
-add_library(aoti_common STATIC ${_aoti_common_sources})
+# ==============================================================================
+# AOTI common shims using ETensor (for Metal backend)
+# TODO(gasoonjia): Remove this after Metal migration
+# ==============================================================================
+add_library(aoti_common_shims STATIC common_shims.cpp)
 target_include_directories(
-  aoti_common
+  aoti_common_shims
   PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
          $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
 )
 target_compile_options(
-  aoti_common
+  aoti_common_shims
   PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
          $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
 )
 target_compile_definitions(
-  aoti_common PRIVATE $<$<PLATFORM_ID:Windows>:EXPORT_AOTI_FUNCTIONS>
-)
-# Ensure symbols are exported properly
-if(APPLE)
-  target_link_options(aoti_common PUBLIC -Wl,-export_dynamic)
-else()
-  target_link_options(
-    aoti_common PUBLIC $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-Wl,--export-dynamic>
-  )
-endif()
+  aoti_common_shims PUBLIC $<$<PLATFORM_ID:Windows>:EXPORT_AOTI_FUNCTIONS>
+)
+target_link_libraries(aoti_common_shims PUBLIC extension_tensor ${CMAKE_DL_LIBS})
 
-# Link against ExecuTorch libraries and standard libraries
-target_link_libraries(aoti_common PUBLIC extension_tensor ${CMAKE_DL_LIBS})
-executorch_target_link_options_shared_lib(aoti_common)
+install(
+  TARGETS aoti_common_shims
+  EXPORT ExecuTorchTargets
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
+
+# ==============================================================================
+# AOTI common shims using SlimTensor (for CUDA backend)
+# Uses SlimTensor for all tensor operations
+# TODO(gasoonjia): Replace aoti_common_shims with this one after Metal migration
+# ==============================================================================
+add_library(aoti_common_shims_slim STATIC common_shims_slim.cpp)
+target_include_directories(
+  aoti_common_shims_slim
+  PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
+         $<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
+)
+target_compile_options(
+  aoti_common_shims_slim
+  PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
+         $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
+)
+target_compile_definitions(
+  aoti_common_shims_slim PUBLIC $<$<PLATFORM_ID:Windows>:EXPORT_AOTI_FUNCTIONS>
+)
+target_link_libraries(
+  aoti_common_shims_slim PUBLIC slimtensor extension_tensor ${CMAKE_DL_LIBS}
+)
 
 install(
-  TARGETS aoti_common
+  TARGETS aoti_common_shims_slim
   EXPORT ExecuTorchTargets
   DESTINATION ${CMAKE_INSTALL_LIBDIR}
 )
diff --git a/backends/aoti/targets.bzl b/backends/aoti/targets.bzl
@@ -33,7 +33,8 @@ def define_common_targets():
         ],
     )
 
-    # AOTI common shims functionality
+    # AOTI common shims functionality using ETensor
+    # TODO(gasoonjia): Remove this after Metal migration
     runtime.cxx_library(
         name = "common_shims",
         srcs = [
@@ -89,6 +90,7 @@ def define_common_targets():
 
     # SlimTensor-based common shims library
     # Uses SlimTensor for all tensor operations
+    # TODO(gasoonjia): Replace common_shims with this one after Metal migration
     runtime.cxx_library(
         name = "common_shims_slim",
         srcs = [
@@ -97,10 +99,27 @@ def define_common_targets():
         headers = [
             "common_shims_slim.h",
             "export.h",
+            "utils.h",
         ],
         visibility = ["@EXECUTORCH_CLIENTS"],
         exported_deps = [
             "//executorch/runtime/core:core",
+            "//executorch/runtime/core/exec_aten:lib",
             "//executorch/backends/aoti/slim/core:slimtensor",
         ],
     )
+
+    # Common AOTI functionality for SlimTensor-based backends (combining common_shims_slim and delegate_handle)
+    # All CUDA backend code should depend on this target
+    # TODO(gasoonjia): Replace aoti_common with this one after Metal migration
+    runtime.cxx_library(
+        name = "aoti_common_slim",
+        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+        link_whole = True,
+        supports_python_dlopen = True,
+        visibility = ["PUBLIC"],
+        exported_deps = [
+            ":common_shims_slim",
+            ":delegate_handle",
+        ],
+    )
@@ -99,14 +99,18 @@ install(
 
 # CUDA-specific AOTI shim symbols (dynamically linked) Uses
 # common_shims_slim.cpp for SlimTensor-based shim implementations
-set(_aoti_cuda_shim_sources
-    runtime/shims/memory.cpp runtime/guard.cpp runtime/shims/cuda_guard.cpp
-    runtime/shims/int4mm.cu ${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp
-    ${EXECUTORCH_ROOT}/backends/aoti/common_shims_slim.cpp
-)
+set(_aoti_cuda_shim_sources
+    runtime/shims/memory.cpp runtime/shims/cuda_guard.cpp
+    runtime/shims/int4mm.cu
+    ${EXECUTORCH_ROOT}/backends/aoti/common_shims_slim.cpp
+    ${EXECUTORCH_ROOT}/backends/aoti/slim/cuda/guard.cpp
+)
 
 add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources})
 
+# Define CUDA_AVAILABLE to use SlimTensor on GPU in common_shims_slim.h
+target_compile_definitions(aoti_cuda_shims PRIVATE CUDA_AVAILABLE=1)
+
 # Define export macros for shared library
 if(MSVC)
   target_compile_definitions(aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS)

diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
@@ -3,28 +3,6 @@ load("//tools/build/buck:nvcc_flags.bzl", "get_nvcc_arch_args")
 
 oncall("executorch")
 
-runtime.cxx_library(
-    name = "guard",
-    srcs = [
-        "guard.cpp",
-    ],
-    headers = [
-        "guard.h",
-        "utils.h",
-    ],
-    visibility = ["PUBLIC"],
-    deps = [
-        "//executorch/runtime/platform:platform",
-    ],
-    exported_deps = [
-        "//executorch/runtime/core:core",
-        "//executorch/runtime/core/exec_aten:lib",
-    ],
-    external_deps = [
-        ("cuda", None, "cuda-lazy"),
-    ],
-)
-
 runtime.cxx_library(
     name = "cuda_platform",
     srcs = [
@@ -71,14 +49,12 @@ runtime.cxx_library(
 runtime.cxx_library(
     name = "runtime_shims",
     srcs = [
-        "guard.cpp",
         "shims/cuda_guard.cpp",
         "shims/int4mm.cu",
         "shims/memory.cpp",
         "shims/tensor_attribute.cpp",
     ],
     headers = [
-        "guard.h",
         "shims/cuda_guard.h",
         "shims/int4mm.cuh",
         "shims/int4mm.h",
@@ -91,43 +67,18 @@ runtime.cxx_library(
     supports_python_dlopen = True,
     # Constructor needed for backend registration.
     compiler_flags = ["-Wno-global-constructors"],
+    preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
     visibility = ["PUBLIC"],
     deps = [
         ":tensor_maker",
-        "//executorch/backends/aoti:common_shims",
-        "//executorch/runtime/core:core",
-        "//executorch/runtime/core/exec_aten:lib",
-        "//executorch/runtime/platform:platform",
-        "//executorch/backends/cuda/runtime:cuda_platform",
-    ],
-    nvcc_flags = get_nvcc_arch_args() + [
-        "-_NVCC_HOST_COMPILER_FLAG_",
-        "gcc",
-    ],
-    external_deps = [
-        ("cuda", None, "cuda-lazy"),
-    ],
-)
-
-runtime.cxx_library(
-    name = "runtime_shims_slim",
-    srcs = [
-        "shims/memory_slim.cpp",
-    ],
-    headers = [
-        "shims/memory_slim.h",
-    ],
-    # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
-    link_whole = True,
-    supports_python_dlopen = True,
-    visibility = ["@EXECUTORCH_CLIENTS"],
-    preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
-    deps = [
+        "//executorch/backends/aoti:aoti_common_slim",
         "//executorch/backends/aoti/slim/core:slimtensor",
         "//executorch/backends/aoti/slim/factory:empty",
         "//executorch/backends/aoti/slim/factory:from_blob",
-        "//executorch/backends/aoti:common_shims",
+        "//executorch/backends/aoti/slim/cuda:guard",
         "//executorch/runtime/core:core",
+        "//executorch/runtime/core/exec_aten:lib",
+        "//executorch/runtime/core/exec_aten/util:tensor_util",
         "//executorch/runtime/platform:platform",
     ],
     nvcc_flags = get_nvcc_arch_args() + [
@@ -149,10 +100,16 @@ runtime.cxx_library(
     supports_python_dlopen = True,
     # Constructor needed for backend registration.
     compiler_flags = ["-Wno-global-constructors"],
+    preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
     visibility = ["PUBLIC"],
     deps = [
         ":runtime_shims",
-        "//executorch/backends/aoti:aoti_common",
+        "//executorch/backends/aoti:aoti_common_slim",
+        "//executorch/backends/aoti/slim/core:slimtensor",
+        "//executorch/backends/aoti/slim/factory:empty",
+        "//executorch/backends/aoti/slim/factory:from_blob",
+        "//executorch/backends/aoti/slim/factory:from_etensor",
+        "//executorch/extension/tensor:tensor",
         "//executorch/runtime/backend:interface",
         "//executorch/runtime/core/exec_aten/util:tensor_util",
     ],