diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 567e781b3bc..d15656ab0b1 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -136,8 +136,10 @@ typedef struct stf_dim4 } stf_dim4; //! \brief Partition (mapper) function: data coordinates -> grid position. -//! Can be implemented in C or provided from Python via ctypes/cffi. -typedef stf_pos4 (*stf_get_executor_fn)(stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims); +//! Writes the result into \p *result. The out-pointer convention is used +//! instead of return-by-value so that the signature is trivially representable +//! in FFI frameworks (ctypes, cffi, Rust) that cannot return C structs. +typedef void (*stf_get_executor_fn)(stf_pos4* result, stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims); //! \brief Create host execution place (CPU). stf_exec_place_handle stf_exec_place_host(void); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 8bdd3f6bd8f..6136e486ffe 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -358,9 +358,10 @@ stf_data_place_handle stf_data_place_composite(stf_exec_place_handle grid, stf_g _CCCL_ASSERT(grid != nullptr, "exec place grid handle must not be null"); _CCCL_ASSERT(mapper != nullptr, "partitioner function (mapper) must not be null"); auto* grid_ptr = from_opaque(grid); - // Distinct function pointer types (C typedef vs C++ alias); not convertible via static_cast under nvcc. - partition_fn_t cpp_mapper = reinterpret_cast(mapper); - auto* dp = stf_try_allocate([cpp_mapper, grid_ptr] { + // Distinct function pointer types (C typedef vs C++ alias) are not + // convertible via static_cast under nvcc. 
+ const auto cpp_mapper = reinterpret_cast(mapper); + auto* dp = stf_try_allocate([cpp_mapper, grid_ptr] { return new data_place(data_place::composite(cpp_mapper, *grid_ptr)); }); return to_opaque(dp); diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp index cf5cc2f481e..9b690813862 100644 --- a/c/experimental/stf/test/test_places.cpp +++ b/c/experimental/stf/test/test_places.cpp @@ -19,7 +19,7 @@ // Blocked partition along first dimension: maps data coordinates to grid position. // Used to exercise composite data place with a grid of execution places. -static stf_pos4 blocked_mapper_1d(stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims) +static void blocked_mapper_1d(stf_pos4* result, stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims) { uint64_t extent = data_dims.x; uint64_t nplaces = grid_dims.x; @@ -34,12 +34,10 @@ static stf_pos4 blocked_mapper_1d(stf_pos4 data_coords, stf_dim4 data_dims, stf_ { place_x = static_cast(nplaces) - 1; } - stf_pos4 result = {}; - result.x = place_x; - result.y = 0; - result.z = 0; - result.t = 0; - return result; + result->x = place_x; + result->y = 0; + result->z = 0; + result->t = 0; } C2H_TEST("empty stf tasks", "[task]") diff --git a/cudax/include/cuda/experimental/__places/data_place_interface.cuh b/cudax/include/cuda/experimental/__places/data_place_interface.cuh index b8a8f24e8ea..33ecab14ffe 100644 --- a/cudax/include/cuda/experimental/__places/data_place_interface.cuh +++ b/cudax/include/cuda/experimental/__places/data_place_interface.cuh @@ -52,8 +52,10 @@ using ::cuda::experimental::stf::pos4; // Forward declarations class exec_place; -//! Function type for computing executor placement from data coordinates -using partition_fn_t = pos4 (*)(pos4, dim4, dim4); +//! Function type for computing executor placement from data coordinates. +//! Uses an out-pointer convention so the signature is trivially representable +//! 
in FFI frameworks (ctypes, cffi, Rust) that cannot return C structs. +using partition_fn_t = void (*)(pos4* result, pos4 data_coords, dim4 data_dims, dim4 grid_dims); /** * @brief Abstract interface for data_place implementations diff --git a/cudax/include/cuda/experimental/__places/localized_array.cuh b/cudax/include/cuda/experimental/__places/localized_array.cuh index 59dead271b4..708e06bd784 100644 --- a/cudax/include/cuda/experimental/__places/localized_array.cuh +++ b/cudax/include/cuda/experimental/__places/localized_array.cuh @@ -358,8 +358,9 @@ private: template pos4 index_to_grid_pos(size_t linearized_index, F&& delinearize) { - pos4 coords = delinearize(linearized_index); - pos4 eplace_coords = mapper(coords, data_dims, grid.get_dims()); + const pos4 coords = delinearize(linearized_index); + pos4 eplace_coords(0); + mapper(&eplace_coords, coords, data_dims, grid.get_dims()); return eplace_coords; } diff --git a/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh b/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh index cc4d5334eb7..dc1ab0f53a4 100644 --- a/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh +++ b/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh @@ -102,7 +102,7 @@ public: return box(bounds); } - _CCCL_HOST_DEVICE static pos4 get_executor(pos4 data_coords, dim4 data_dims, dim4 grid_dims) + _CCCL_HOST_DEVICE static void get_executor(pos4* result, pos4 data_coords, dim4 data_dims, dim4 grid_dims) { // Find the largest dimension size_t rank = data_dims.get_rank(); @@ -120,7 +120,7 @@ public: // Get the coordinate in the selected dimension size_t c = data_coords.get(target_dim); - return pos4(c / part_size); + *result = pos4(c / part_size); } }; @@ -150,9 +150,10 @@ UNITTEST("blocked partition with very large data arrays") pos4 middle_coord(200, 150, 100, 500); pos4 last_coord(399, 299, 199, 999); - pos4 first_pos = 
blocked_partition::get_executor(first_coord, massive_4d_dims, grid_dims); - pos4 middle_pos = blocked_partition::get_executor(middle_coord, massive_4d_dims, grid_dims); - pos4 last_pos = blocked_partition::get_executor(last_coord, massive_4d_dims, grid_dims); + pos4 first_pos, middle_pos, last_pos; + blocked_partition::get_executor(&first_pos, first_coord, massive_4d_dims, grid_dims); + blocked_partition::get_executor(&middle_pos, middle_coord, massive_4d_dims, grid_dims); + blocked_partition::get_executor(&last_pos, last_coord, massive_4d_dims, grid_dims); // part_size = ceil(1000/4) = 250 // t=0 -> block 0, t=500 -> block 2, t=999 -> block 3 diff --git a/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh b/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh index c90bffa2a71..dbb61a63b49 100644 --- a/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh +++ b/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh @@ -247,10 +247,9 @@ public: return cyclic_shape(bounds); } - _CCCL_HOST_DEVICE static pos4 get_executor(pos4 /*unused*/, dim4 /*unused*/, dim4 /*unused*/) + _CCCL_HOST_DEVICE static void get_executor(pos4* /*unused*/, pos4 /*unused*/, dim4 /*unused*/, dim4 /*unused*/) { abort(); - return pos4(0); } }; diff --git a/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh b/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh index 27c65b042e0..ba5262aa068 100644 --- a/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh +++ b/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh @@ -137,10 +137,10 @@ public: return reserved::tiled_mdspan_shape(in, place_position.x, grid_dims.x); } - _CCCL_HOST_DEVICE static pos4 get_executor(pos4 data_coords, dim4 /*unused*/, dim4 grid_dims) + _CCCL_HOST_DEVICE static void get_executor(pos4* result, pos4 data_coords, dim4 /*unused*/, dim4 grid_dims) { assert(grid_dims.x > 0); - return 
pos4((data_coords.x / tile_size) % grid_dims.x); + *result = pos4((data_coords.x / tile_size) % grid_dims.x); } }; @@ -178,7 +178,8 @@ UNITTEST("tiled partition with large 1D data") constexpr size_t tile_size = 1000; - pos4 tile_pos = tiled_partition::get_executor(large_coords, data_dims, grid_dims); + pos4 tile_pos; + tiled_partition::get_executor(&tile_pos, large_coords, data_dims, grid_dims); EXPECT(tile_pos.x == (test_coord / tile_size) % grid_dims.x); }; diff --git a/docs/cudax/places.rst b/docs/cudax/places.rst index f37c1e128cc..63a5b94734e 100644 --- a/docs/cudax/places.rst +++ b/docs/cudax/places.rst @@ -478,7 +478,7 @@ over the different places of a grid. template static const S_out apply(const S_in& in, pos4 position, dim4 grid_dims); - pos4 get_executor(pos4 data_coords, dim4 data_dims, dim4 grid_dims); + void get_executor(pos4* result, pos4 data_coords, dim4 data_dims, dim4 grid_dims); }; A partitioning class must implement an ``apply`` method which takes: @@ -504,8 +504,8 @@ method which allows localized data allocators. This method indicates, for each entry of a shape, on which place this entry should *preferably* be allocated. -``get_executor`` returns a ``pos4`` coordinate in the execution place -grid, and its arguments are: +``get_executor`` writes a ``pos4`` coordinate in the execution place +grid into ``*result``, and its input arguments are: - a coordinate within the shape described as a ``pos4`` object - the dimension of the shape expressed as a ``dim4`` object