From 5f33c27c1f0356fd1f0be74a293fbda472e6ea22 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Sat, 23 May 2026 01:41:51 +0200 Subject: [PATCH 1/2] [CUDAX] Use out parameter for partition mappers --- .../stf/include/cccl/c/experimental/stf/stf.h | 6 ++++-- c/experimental/stf/src/stf.cu | 7 ++++--- c/experimental/stf/test/test_places.cpp | 12 +++++------- .../experimental/__places/data_place_interface.cuh | 6 ++++-- .../cuda/experimental/__places/localized_array.cuh | 5 +++-- .../__places/partitions/blocked_partition.cuh | 11 ++++++----- .../__places/partitions/cyclic_shape.cuh | 4 ++-- .../__places/partitions/tiled_partition.cuh | 7 ++++--- docs/cudax/places.rst | 6 +++--- 9 files changed, 35 insertions(+), 29 deletions(-) diff --git a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h index 567e781b3bc..d15656ab0b1 100644 --- a/c/experimental/stf/include/cccl/c/experimental/stf/stf.h +++ b/c/experimental/stf/include/cccl/c/experimental/stf/stf.h @@ -136,8 +136,10 @@ typedef struct stf_dim4 } stf_dim4; //! \brief Partition (mapper) function: data coordinates -> grid position. -//! Can be implemented in C or provided from Python via ctypes/cffi. -typedef stf_pos4 (*stf_get_executor_fn)(stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims); +//! Writes the result into \p *result. The out-pointer convention is used +//! instead of return-by-value so that the signature is trivially representable +//! in FFI frameworks (ctypes, cffi, Rust) that cannot return C structs. +typedef void (*stf_get_executor_fn)(stf_pos4* result, stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims); //! \brief Create host execution place (CPU). stf_exec_place_handle stf_exec_place_host(void); diff --git a/c/experimental/stf/src/stf.cu b/c/experimental/stf/src/stf.cu index 8bdd3f6bd8f..ad30d1177b7 100644 --- a/c/experimental/stf/src/stf.cu +++ b/c/experimental/stf/src/stf.cu @@ -358,9 +358,10 @@ stf_data_place_handle stf_data_place_composite(stf_exec_place_handle grid, stf_g _CCCL_ASSERT(grid != nullptr, "exec place grid handle must not be null"); _CCCL_ASSERT(mapper != nullptr, "partitioner function (mapper) must not be null"); auto* grid_ptr = from_opaque(grid); - // Distinct function pointer types (C typedef vs C++ alias); not convertible via static_cast under nvcc. - partition_fn_t cpp_mapper = reinterpret_cast(mapper); - auto* dp = stf_try_allocate([cpp_mapper, grid_ptr] { + // Distinct function pointer types (C typedef vs C++ alias) are not + // convertible via static_cast under nvcc. + const partition_fn_t cpp_mapper = reinterpret_cast(mapper); + auto* dp = stf_try_allocate([cpp_mapper, grid_ptr] { return new data_place(data_place::composite(cpp_mapper, *grid_ptr)); }); return to_opaque(dp); diff --git a/c/experimental/stf/test/test_places.cpp b/c/experimental/stf/test/test_places.cpp index cf5cc2f481e..9b690813862 100644 --- a/c/experimental/stf/test/test_places.cpp +++ b/c/experimental/stf/test/test_places.cpp @@ -19,7 +19,7 @@ // Blocked partition along first dimension: maps data coordinates to grid position. // Used to exercise composite data place with a grid of execution places. -static stf_pos4 blocked_mapper_1d(stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims) +static void blocked_mapper_1d(stf_pos4* result, stf_pos4 data_coords, stf_dim4 data_dims, stf_dim4 grid_dims) { uint64_t extent = data_dims.x; uint64_t nplaces = grid_dims.x; @@ -34,12 +34,10 @@ static stf_pos4 blocked_mapper_1d(stf_pos4 data_coords, stf_dim4 data_dims, stf_ { place_x = static_cast(nplaces) - 1; } - stf_pos4 result = {}; - result.x = place_x; - result.y = 0; - result.z = 0; - result.t = 0; - return result; + result->x = place_x; + result->y = 0; + result->z = 0; + result->t = 0; } C2H_TEST("empty stf tasks", "[task]") diff --git a/cudax/include/cuda/experimental/__places/data_place_interface.cuh b/cudax/include/cuda/experimental/__places/data_place_interface.cuh index b8a8f24e8ea..33ecab14ffe 100644 --- a/cudax/include/cuda/experimental/__places/data_place_interface.cuh +++ b/cudax/include/cuda/experimental/__places/data_place_interface.cuh @@ -52,8 +52,10 @@ using ::cuda::experimental::stf::pos4; // Forward declarations class exec_place; -//! Function type for computing executor placement from data coordinates -using partition_fn_t = pos4 (*)(pos4, dim4, dim4); +//! Function type for computing executor placement from data coordinates. +//! Uses an out-pointer convention so the signature is trivially representable +//! in FFI frameworks (ctypes, cffi, Rust) that cannot return C structs. +using partition_fn_t = void (*)(pos4* result, pos4 data_coords, dim4 data_dims, dim4 grid_dims); /** * @brief Abstract interface for data_place implementations diff --git a/cudax/include/cuda/experimental/__places/localized_array.cuh b/cudax/include/cuda/experimental/__places/localized_array.cuh index 59dead271b4..264edc49e2d 100644 --- a/cudax/include/cuda/experimental/__places/localized_array.cuh +++ b/cudax/include/cuda/experimental/__places/localized_array.cuh @@ -358,8 +358,9 @@ private: template pos4 index_to_grid_pos(size_t linearized_index, F&& delinearize) { - pos4 coords = delinearize(linearized_index); - pos4 eplace_coords = mapper(coords, data_dims, grid.get_dims()); + const pos4 coords = delinearize(linearized_index); + pos4 eplace_coords; + mapper(&eplace_coords, coords, data_dims, grid.get_dims()); return eplace_coords; } diff --git a/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh b/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh index cc4d5334eb7..dc1ab0f53a4 100644 --- a/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh +++ b/cudax/include/cuda/experimental/__places/partitions/blocked_partition.cuh @@ -102,7 +102,7 @@ public: return box(bounds); } - _CCCL_HOST_DEVICE static pos4 get_executor(pos4 data_coords, dim4 data_dims, dim4 grid_dims) + _CCCL_HOST_DEVICE static void get_executor(pos4* result, pos4 data_coords, dim4 data_dims, dim4 grid_dims) { // Find the largest dimension size_t rank = data_dims.get_rank(); @@ -120,7 +120,7 @@ public: // Get the coordinate in the selected dimension size_t c = data_coords.get(target_dim); - return pos4(c / part_size); + *result = pos4(c / part_size); } }; @@ -150,9 +150,10 @@ UNITTEST("blocked partition with very large data arrays") pos4 middle_coord(200, 150, 100, 500); pos4 last_coord(399, 299, 199, 999); - pos4 first_pos = blocked_partition::get_executor(first_coord, massive_4d_dims, grid_dims); - pos4 middle_pos = blocked_partition::get_executor(middle_coord, massive_4d_dims, grid_dims); - pos4 last_pos = blocked_partition::get_executor(last_coord, massive_4d_dims, grid_dims); + pos4 first_pos, middle_pos, last_pos; + blocked_partition::get_executor(&first_pos, first_coord, massive_4d_dims, grid_dims); + blocked_partition::get_executor(&middle_pos, middle_coord, massive_4d_dims, grid_dims); + blocked_partition::get_executor(&last_pos, last_coord, massive_4d_dims, grid_dims); // part_size = ceil(1000/4) = 250 // t=0 -> block 0, t=500 -> block 2, t=999 -> block 3 diff --git a/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh b/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh index c90bffa2a71..b8bc7369231 100644 --- a/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh +++ b/cudax/include/cuda/experimental/__places/partitions/cyclic_shape.cuh @@ -247,10 +247,10 @@ public: return cyclic_shape(bounds); } - _CCCL_HOST_DEVICE static pos4 get_executor(pos4 /*unused*/, dim4 /*unused*/, dim4 /*unused*/) + _CCCL_HOST_DEVICE static void get_executor(pos4* result, pos4 /*unused*/, dim4 /*unused*/, dim4 /*unused*/) { abort(); - return pos4(0); + *result = pos4(0); } }; diff --git a/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh b/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh index 27c65b042e0..ba5262aa068 100644 --- a/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh +++ b/cudax/include/cuda/experimental/__places/partitions/tiled_partition.cuh @@ -137,10 +137,10 @@ public: return reserved::tiled_mdspan_shape(in, place_position.x, grid_dims.x); } - _CCCL_HOST_DEVICE static pos4 get_executor(pos4 data_coords, dim4 /*unused*/, dim4 grid_dims) + _CCCL_HOST_DEVICE static void get_executor(pos4* result, pos4 data_coords, dim4 /*unused*/, dim4 grid_dims) { assert(grid_dims.x > 0); - return pos4((data_coords.x / tile_size) % grid_dims.x); + *result = pos4((data_coords.x / tile_size) % grid_dims.x); } }; @@ -178,7 +178,8 @@ UNITTEST("tiled partition with large 1D data") constexpr size_t tile_size = 1000; - pos4 tile_pos = tiled_partition::get_executor(large_coords, data_dims, grid_dims); + pos4 tile_pos; + tiled_partition::get_executor(&tile_pos, large_coords, data_dims, grid_dims); EXPECT(tile_pos.x == (test_coord / tile_size) % grid_dims.x); }; diff --git a/docs/cudax/places.rst b/docs/cudax/places.rst index f37c1e128cc..63a5b94734e 100644 --- a/docs/cudax/places.rst +++ b/docs/cudax/places.rst @@ -478,7 +478,7 @@ over the different places of a grid. template static const S_out apply(const S_in& in, pos4 position, dim4 grid_dims); - pos4 get_executor(pos4 data_coords, dim4 data_dims, dim4 grid_dims); + void get_executor(pos4* result, pos4 data_coords, dim4 data_dims, dim4 grid_dims); }; A partitioning class must implement an ``apply`` method which takes: @@ -504,8 +504,8 @@ method which allows localized data allocators. This method indicates, for each entry of a shape, on which place this entry should *preferably* be allocated. -``get_executor`` returns a ``pos4`` coordinate in the execution place -grid, and its arguments are: +``get_executor`` writes a ``pos4`` coordinate in the execution place +grid into ``*result``, and its input arguments are: - a coordinate within the shape described as a ``pos4`` object - the dimension of the shape expressed as a ``dim4`` object From 4c66f734771ea828c070888cc4507f4424c338d2 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Tue, 26 May 2026 08:57:32 +0200 Subject: [PATCH 2/2] [CUDAX] Initialize mapper output coordinates --- cudax/include/cuda/experimental/__places/localized_array.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__places/localized_array.cuh b/cudax/include/cuda/experimental/__places/localized_array.cuh index 264edc49e2d..708e06bd784 100644 --- a/cudax/include/cuda/experimental/__places/localized_array.cuh +++ b/cudax/include/cuda/experimental/__places/localized_array.cuh @@ -359,7 +359,7 @@ private: pos4 index_to_grid_pos(size_t linearized_index, F&& delinearize) { const pos4 coords = delinearize(linearized_index); - pos4 eplace_coords; + pos4 eplace_coords(0); mapper(&eplace_coords, coords, data_dims, grid.get_dims()); return eplace_coords; }