diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
index 567e781b3bc..d15656ab0b1 100644
--- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
+++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h
@@ -136,8 +136,10 @@ typedef struct stf_dim4
 } stf_dim4;
 
 //! \brief Partition (mapper) function: data coordinates -> grid position.
-//! Can be implemented in C or provided from Python via ctypes/cffi.
-typedef stf_pos4 (*stf_get_executor_fn)(stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims);
+//! Writes the result into \p *result. The out-pointer convention is used
+//! instead of return-by-value so that the signature is trivially representable
+//! in FFI frameworks (ctypes, cffi, Rust) whose callbacks cannot reliably return C structs by value.
+typedef void (*stf_get_executor_fn)(stf_pos4* result, stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims);
 
 //! \brief Create host execution place (CPU).
 stf_exec_place_handle stf_exec_place_host(void);
diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu
index 8bdd3f6bd8f..6136e486ffe 100644
--- a/c/experimental/stf/src/stf.cu
+++ b/c/experimental/stf/src/stf.cu
@@ -358,9 +358,10 @@ stf_data_place_handle stf_data_place_composite(stf_exec_place_handle grid, stf_g
   _CCCL_ASSERT(grid != nullptr, "exec place grid handle must not be null");
   _CCCL_ASSERT(mapper != nullptr, "partitioner function (mapper) must not be null");
   auto* grid_ptr = from_opaque(grid);
-  // Distinct function pointer types (C typedef vs C++ alias); not convertible via static_cast under nvcc.
-  partition_fn_t cpp_mapper = reinterpret_cast<partition_fn_t>(mapper);
-  auto* dp                  = stf_try_allocate([cpp_mapper, grid_ptr] {
+  // Distinct function pointer types (C typedef vs C++ alias) are not
+  // convertible via static_cast under nvcc.
+  const auto cpp_mapper = reinterpret_cast<partition_fn_t>(mapper);
+  auto* dp              = stf_try_allocate([cpp_mapper, grid_ptr] {
     return new data_place(data_place::composite(cpp_mapper, *grid_ptr));
   });
   return to_opaque(dp);
diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp
index cf5cc2f481e..9b690813862 100644
--- a/c/experimental/stf/test/test_places.cpp
+++ b/c/experimental/stf/test/test_places.cpp
@@ -19,7 +19,7 @@
 
 // Blocked partition along first dimension: maps data coordinates to grid position.
 // Used to exercise composite data place with a grid of execution places.
-static stf_pos4 blocked_mapper_1d(stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims)
+static void blocked_mapper_1d(stf_pos4* result, stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims)
 {
   uint64_t extent    = data_dims.x;
   uint64_t nplaces   = grid_dims.x;
@@ -34,12 +34,10 @@ static stf_pos4 blocked_mapper_1d(stf_pos4 data_coords, stf_dim4 data_dims, stf_
   {
     place_x = static_cast<int64_t>(nplaces) - 1;
   }
-  stf_pos4 result = {};
-  result.x        = place_x;
-  result.y        = 0;
-  result.z        = 0;
-  result.t        = 0;
-  return result;
+  result->x = place_x;
+  result->y = 0;
+  result->z = 0;
+  result->t = 0;
 }
 
 C2H_TEST("empty stf tasks", "[task]")
diff --git a/cudax/include/cuda/experimental/__places/data_place_interface.cuh b/cudax/include/cuda/experimental/__places/data_place_interface.cuh
index b8a8f24e8ea..33ecab14ffe 100644
--- a/cudax/include/cuda/experimental/__places/data_place_interface.cuh
+++ b/cudax/include/cuda/experimental/__places/data_place_interface.cuh
@@ -52,8 +52,10 @@ using ::cuda::experimental::stf::pos4;
 // Forward declarations
 class exec_place;
 
-//! Function type for computing executor placement from data coordinates
-using partition_fn_t = pos4 (*)(pos4, dim4, dim4);
+//! Function type for computing executor placement from data coordinates.
+//! Uses an out-pointer convention so the signature is trivially representable
+//! in FFI frameworks (ctypes, cffi, Rust) whose callbacks cannot reliably return C structs by value.
+using partition_fn_t = void (*)(pos4* result, pos4 data_coords, dim4 data_dims, dim4 grid_dims);
 
 /**
  * @brief Abstract interface for data_place implementations
diff --git a/cudax/include/cuda/experimental/__places/localized_array.cuh b/cudax/include/cuda/experimental/__places/localized_array.cuh
index 59dead271b4..708e06bd784 100644
--- a/cudax/include/cuda/experimental/__places/localized_array.cuh
+++ b/cudax/include/cuda/experimental/__places/localized_array.cuh
@@ -358,8 +358,9 @@ private:
   template <typename F>
   pos4 index_to_grid_pos(size_t linearized_index, F&& delinearize)
   {
-    pos4 coords        = delinearize(linearized_index);
-    pos4 eplace_coords = mapper(coords, data_dims, grid.get_dims());
+    const pos4 coords = delinearize(linearized_index);
+    pos4 eplace_coords(0);
+    mapper(&eplace_coords, coords, data_dims, grid.get_dims()); // out-pointer call: mapper stores the grid position
     return eplace_coords;
   }
 
diff --git a/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh b/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh
index cc4d5334eb7..dc1ab0f53a4 100644
--- a/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh
+++ b/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh
@@ -102,7 +102,7 @@ public:
     return box<dimensions>(bounds);
   }
 
-  _CCCL_HOST_DEVICE static pos4 get_executor(pos4 data_coords, dim4 data_dims, dim4 grid_dims)
+  _CCCL_HOST_DEVICE static void get_executor(pos4* result, pos4 data_coords, dim4 data_dims, dim4 grid_dims)
   {
     // Find the largest dimension
     size_t rank       = data_dims.get_rank();
@@ -120,7 +120,7 @@ public:
     // Get the coordinate in the selected dimension
     size_t c = data_coords.get(target_dim);
 
-    return pos4(c / part_size);
+    *result = pos4(c / part_size);
   }
 };
 
@@ -150,9 +150,10 @@ UNITTEST("blocked partition with very large data arrays")
   pos4 middle_coord(200, 150, 100, 500);
   pos4 last_coord(399, 299, 199, 999);
 
-  pos4 first_pos  = blocked_partition::get_executor(first_coord, massive_4d_dims, grid_dims);
-  pos4 middle_pos = blocked_partition::get_executor(middle_coord, massive_4d_dims, grid_dims);
-  pos4 last_pos   = blocked_partition::get_executor(last_coord, massive_4d_dims, grid_dims);
+  pos4 first_pos, middle_pos, last_pos;
+  blocked_partition::get_executor(&first_pos, first_coord, massive_4d_dims, grid_dims);
+  blocked_partition::get_executor(&middle_pos, middle_coord, massive_4d_dims, grid_dims);
+  blocked_partition::get_executor(&last_pos, last_coord, massive_4d_dims, grid_dims);
 
   // part_size = ceil(1000/4) = 250
   // t=0   -> block 0, t=500 -> block 2, t=999 -> block 3
diff --git a/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh b/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh
index c90bffa2a71..dbb61a63b49 100644
--- a/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh
+++ b/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh
@@ -247,10 +247,10 @@ public:
     return cyclic_shape<dimensions>(bounds);
   }
 
-  _CCCL_HOST_DEVICE static pos4 get_executor(pos4 /*unused*/, dim4 /*unused*/, dim4 /*unused*/)
+  // This partitioner provides no executor mapping: every call aborts, so *result is never written.
+  _CCCL_HOST_DEVICE static void get_executor(pos4* /*unused*/, pos4 /*unused*/, dim4 /*unused*/, dim4 /*unused*/)
   {
     abort();
-    return pos4(0);
   }
 };
 
diff --git a/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh b/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh
index 27c65b042e0..ba5262aa068 100644
--- a/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh
+++ b/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh
@@ -137,10 +137,10 @@ public:
     return reserved::tiled_mdspan_shape<tile_size, mdspan_shape_t>(in, place_position.x, grid_dims.x);
   }
 
-  _CCCL_HOST_DEVICE static pos4 get_executor(pos4 data_coords, dim4 /*unused*/, dim4 grid_dims)
+  _CCCL_HOST_DEVICE static void get_executor(pos4* result, pos4 data_coords, dim4 /*unused*/, dim4 grid_dims)
   {
     assert(grid_dims.x > 0);
-    return pos4((data_coords.x / tile_size) % grid_dims.x);
+    *result = pos4((data_coords.x / tile_size) % grid_dims.x); // tile index along x, taken modulo grid_dims.x
   }
 };
 
@@ -178,7 +178,8 @@ UNITTEST("tiled partition with large 1D data")
 
   constexpr size_t tile_size = 1000;
 
-  pos4 tile_pos = tiled_partition<tile_size>::get_executor(large_coords, data_dims, grid_dims);
+  pos4 tile_pos;
+  tiled_partition<tile_size>::get_executor(&tile_pos, large_coords, data_dims, grid_dims);
 
   EXPECT(tile_pos.x == (test_coord / tile_size) % grid_dims.x);
 };
diff --git a/docs/cudax/places.rst b/docs/cudax/places.rst
index f37c1e128cc..63a5b94734e 100644
--- a/docs/cudax/places.rst
+++ b/docs/cudax/places.rst
@@ -478,7 +478,7 @@ over the different places of a grid.
        template <typename S_out, typename S_in>
        static const S_out apply(const S_in& in, pos4 position, dim4 grid_dims);
 
-       pos4 get_executor(pos4 data_coords, dim4 data_dims, dim4 grid_dims);
+       void get_executor(pos4* result, pos4 data_coords, dim4 data_dims, dim4 grid_dims);
    };
 
 A partitioning class must implement an ``apply`` method which takes:
@@ -504,8 +504,8 @@ method which allows localized data allocators. This
 method indicates, for each entry of a shape, on which place this entry
 should *preferably* be allocated.
 
-``get_executor`` returns a ``pos4`` coordinate in the execution place
-grid, and its arguments are:
+``get_executor`` writes a ``pos4`` coordinate in the execution place
+grid into ``*result``, and its input arguments are:
 
 - a coordinate within the shape described as a ``pos4`` object
 - the dimension of the shape expressed as a ``dim4`` object