From e23c56cef67f94319e1f434f9e32277148146cca Mon Sep 17 00:00:00 2001 From: anon <79939935+thom-gg@users.noreply.github.com> Date: Sat, 23 May 2026 17:37:33 +0200 Subject: [PATCH 1/3] Asserting current device and CUB stream matches --- .../dispatch/dispatch_adjacent_difference.cuh | 2 ++ .../device/dispatch/dispatch_batch_memcpy.cuh | 1 + .../device/dispatch/dispatch_batched_topk.cuh | 1 + cub/cub/device/dispatch/dispatch_find.cuh | 1 + cub/cub/device/dispatch/dispatch_for.cuh | 1 + .../device/dispatch/dispatch_histogram.cuh | 1 + cub/cub/device/dispatch/dispatch_merge.cuh | 1 + .../device/dispatch/dispatch_merge_sort.cuh | 2 ++ .../device/dispatch/dispatch_radix_sort.cuh | 2 ++ cub/cub/device/dispatch/dispatch_reduce.cuh | 2 ++ .../dispatch/dispatch_reduce_by_key.cuh | 2 ++ .../dispatch_reduce_deterministic.cuh | 1 + .../dispatch_reduce_nondeterministic.cuh | 1 + cub/cub/device/dispatch/dispatch_rle.cuh | 2 ++ cub/cub/device/dispatch/dispatch_scan.cuh | 2 ++ .../device/dispatch/dispatch_scan_by_key.cuh | 2 ++ .../dispatch_segmented_radix_sort.cuh | 2 ++ .../dispatch/dispatch_segmented_reduce.cuh | 2 ++ .../dispatch/dispatch_segmented_scan.cuh | 1 + .../dispatch/dispatch_segmented_sort.cuh | 2 ++ .../device/dispatch/dispatch_select_if.cuh | 2 ++ .../dispatch/dispatch_three_way_partition.cuh | 2 ++ cub/cub/device/dispatch/dispatch_topk.cuh | 1 + .../device/dispatch/dispatch_transform.cuh | 1 + .../dispatch/dispatch_unique_by_key.cuh | 2 ++ cub/cub/util_device.cuh | 21 +++++++++++++++++++ 26 files changed, 60 insertions(+) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 808a696d8f6..d8dae488d1a 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -285,6 +285,7 @@ struct DispatchAdjacentDifference DifferenceOpT difference_op, cudaStream_t stream) { + validate_stream_device(stream); cudaError error = cudaSuccess; do { @@ -336,6 +337,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); using InputT = detail::it_value_t; ::cuda::compute_capability cc{}; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index b0df4be876f..7d6f23d5bbe 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -305,6 +305,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream, PolicySelectorT policy_selector = {}) { + validate_stream_device(stream); using per_invocation_buffer_offset_t = detail::batch_memcpy::per_invocation_buffer_offset_t; using BufferSizeT = cub::detail::it_value_t; using BLevBufferOffsetTileState = cub::ScanTileState; diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh index ec7d39b0bcb..9908577a9db 100644 --- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh @@ -213,6 +213,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream = nullptr, [[maybe_unused]] PolicySelector policy_selector = {}) { + validate_stream_device(stream); using large_segment_tile_offset_t = typename TotalNumItemsGuaranteeT::value_type; // Helper that determines (a) whether there's any one-worker-per-segment policy supporting the range of segment // sizes and k, and (b) if so, which set of one-worker-per-segment policies to use diff --git a/cub/cub/device/dispatch/dispatch_find.cuh b/cub/cub/device/dispatch/dispatch_find.cuh index 024498e320a..c5824ecc4b3 100644 --- a/cub/cub/device/dispatch/dispatch_find.cuh +++ b/cub/cub/device/dispatch/dispatch_find.cuh @@ -97,6 +97,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream, PolicySelector policy_selector = {}) { + validate_stream_device(stream); using output_t = it_value_t; // if the output iterator can be turned into a pointer, the value type is integral, and has the same size as OffsetT diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh index 0c1ab4dd7e6..b1f6d9fb7c1 100644 --- a/cub/cub/device/dispatch/dispatch_for.cuh +++ b/cub/cub/device/dispatch/dispatch_for.cuh @@ -115,6 +115,7 @@ template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(OffsetT num_items, OpT op, cudaStream_t stream, PolicySelector policy_selector = {}) { + validate_stream_device(stream); if (num_items == 0) { return cudaSuccess; diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 9d5c8dc2874..2491d2b2f5f 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -192,6 +192,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index 18f0b310105..92e14b45725 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -210,6 +210,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 788b217042f..224fe430a31 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -403,6 +403,7 @@ public: KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + validate_stream_device(stream); // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -473,6 +474,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KeyT* = nullptr /* for CCCL.C */, ValueT* = nullptr /* for CCCL.C */) -> cudaError_t { + validate_stream_device(stream); [[maybe_unused]] constexpr bool keys_only = ::cuda::std::is_same_v; if (num_items == 0) diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index c6eaf3ca33c..dc23c9f12f8 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1138,6 +1138,7 @@ public: KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + validate_stream_device(stream); // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -1202,6 +1203,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 9b71e37a569..2cb09a901dc 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -478,6 +478,7 @@ struct DispatchReduce KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + validate_stream_device(stream); // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -750,6 +751,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); // from Dispatch() ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index e00f0786de1..299ba312212 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -606,6 +606,7 @@ struct DispatchReduceByKey OffsetT num_items, cudaStream_t stream) { + validate_stream_device(stream); cudaError error = cudaSuccess; do @@ -694,6 +695,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( cudaStream_t stream, PolicySelector policy_selector = {}) { + validate_stream_device(stream); using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet using ScanTileStateT = ReduceByKeyScanTileState; [[maybe_unused]] static constexpr int init_kernel_threads = 128; diff --git a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh index 4b54e9d53bc..eb77d983bc1 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh @@ -339,6 +339,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); // Get CC ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) diff --git a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh index b57bf088072..e1b8007d461 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh @@ -173,6 +173,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); // Get CC ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 1bfb5088c31..bb4bf4668d2 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -605,6 +605,7 @@ struct DeviceRleDispatch OffsetT num_items, cudaStream_t stream) { + validate_stream_device(stream); cudaError error = cudaSuccess; // Get PTX version @@ -662,6 +663,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( cudaStream_t stream, PolicySelector policy_selector = {}) { + validate_stream_device(stream); using local_offset_t = ::cuda::std::int32_t; using global_offset_t = OffsetT; static constexpr bool use_streaming_invocation = diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 6eb80ab85cb..cfd9b4cf1a6 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -862,6 +862,7 @@ struct DispatchScan KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + validate_stream_device(stream); // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -929,6 +930,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) -> cudaError_t { + validate_stream_device(stream); static_assert(::cuda::std::is_unsigned_v && sizeof(OffsetT) >= 4, "DispatchScan only supports unsigned offset types of at least 4-bytes"); diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 1fbf5725649..f0957c0ea90 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -596,6 +596,7 @@ struct DispatchScanByKey KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) { @@ -733,6 +734,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) -> cudaError_t { + validate_stream_device(stream); static_assert(::cuda::std::is_unsigned_v && sizeof(OffsetT) >= 4, "DispatchScan only supports unsigned offset types of at least 4-bytes"); diff --git a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh index 4b90a2f1978..5c37dbde822 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh @@ -617,6 +617,7 @@ struct DispatchSegmentedRadixSort KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + validate_stream_device(stream); cudaError_t error; do { @@ -903,6 +904,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( DecomposerT decomposer = {}, TuningEnvT = {}) { + validate_stream_device(stream); using default_policy_selector_t = policy_selector_from_types; using policy_selector_t = ::cuda::std::decay_t< ::cuda::std::execution::__query_result_or_t>; diff --git a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh index 63ffd1c7a7a..20cc042575e 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh @@ -421,6 +421,7 @@ struct DispatchSegmentedReduce KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + validate_stream_device(stream); if (num_segments <= 0) { return cudaSuccess; @@ -527,6 +528,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); if (num_segments <= 0) { return cudaSuccess; diff --git a/cub/cub/device/dispatch/dispatch_segmented_scan.cuh b/cub/cub/device/dispatch/dispatch_segmented_scan.cuh index 221aca57b00..6c9b7dd5e07 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_scan.cuh @@ -129,6 +129,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); static_assert(::cuda::std::is_integral_v && sizeof(OffsetT) >= 4 && sizeof(OffsetT) <= 8, "dispatch_segmented_scan only supports integral offset types of 4- or 8-bytes"); diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 9f317ec85ee..2ac012906ac 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -689,6 +689,7 @@ struct DispatchSegmentedSort MaxPolicyT max_policy = {}, PartitionMaxPolicyT partition_max_policy = {}) { + validate_stream_device(stream); // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -1281,6 +1282,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( PartitionKernelSource partition_kernel_source = {}, KernelLauncherFactory launcher_factory = {}) -> cudaError_t { + validate_stream_device(stream); [[maybe_unused]] static constexpr bool keys_only = ::cuda::std::is_same_v; const auto get_num_passes = [&](int radix_bits) { diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 30626dac474..6eabeaa10b5 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -843,6 +843,7 @@ struct DispatchSelectIf OffsetT num_items, cudaStream_t stream) { + validate_stream_device(stream); int ptx_version = 0; if (cudaError_t error = CubDebug(PtxVersion(ptx_version))) { @@ -1101,6 +1102,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 2cabaf3a31a..c69afafba45 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -364,6 +364,7 @@ struct DispatchThreeWayPartitionIf KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + validate_stream_device(stream); // Get PTX version int ptx_version = 0; if (cudaError error = CubDebug(launcher_factory.PtxVersion(ptx_version)); cudaSuccess != error) @@ -434,6 +435,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh index d0d171eb88a..d2167f72a54 100644 --- a/cub/cub/device/dispatch/dispatch_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_topk.cuh @@ -475,6 +475,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 098b9b38f00..805d20e21b1 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -556,6 +556,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + validate_stream_device(stream); static_assert( ::cuda::std::is_same_v || ::cuda::std::is_same_v, "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types"); diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 957e5a6589c..cf8b5e4e096 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -437,6 +437,7 @@ struct DispatchUniqueByKey KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + validate_stream_device(stream); int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) { @@ -502,6 +503,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KeyT* = nullptr /* for CCCL.C */, ValueT* = nullptr /* for CCCL.C */) -> cudaError_t { + validate_stream_device(stream); ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 09caa064b2a..7196d4fb1bc 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -456,6 +456,27 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream([[maybe_unused]] cudaStream_t NV_IF_ELSE_TARGET(NV_IS_HOST, (return CubDebug(cudaStreamSynchronize(stream));), (return cudaErrorNotSupported;)) } +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t validate_stream_device(cudaStream_t stream) +{ + cudaError_t error = cudaSuccess; +# if _CCCL_CTK_AT_LEAST(12, 8) + int streamDevice; + error = cudaStreamGetDevice(stream, &streamDevice); + if (error != cudaSuccess) + { + return error; + } + int currentDevice; + error = cudaGetDevice(¤tDevice); + if (error != cudaSuccess) + { + return error; + } + static_assert(currentDevice == streamDevice); +# endif // _CCCL_CTK_AT_LEAST(12,8) + return error; +} + //! @brief Computes the maximum potential dynamic shared memory size per block for kernel @p kernel_ptr taking into //! account the amount of kernel's static and CUDA Driver's reserved shared memory. //! From a8d21f9804c321d5ceb2ae617b47419fd65356e8 Mon Sep 17 00:00:00 2001 From: anon <79939935+thom-gg@users.noreply.github.com> Date: Sat, 23 May 2026 19:44:27 +0200 Subject: [PATCH 2/3] Use _CCCL_ASSERT and catch error in calling function --- .../device/dispatch/dispatch_adjacent_difference.cuh | 12 ++++++++++-- cub/cub/device/dispatch/dispatch_batch_memcpy.cuh | 6 +++++- cub/cub/device/dispatch/dispatch_batched_topk.cuh | 6 +++++- cub/cub/device/dispatch/dispatch_find.cuh | 6 +++++- cub/cub/device/dispatch/dispatch_for.cuh | 6 +++++- cub/cub/device/dispatch/dispatch_histogram.cuh | 6 +++++- cub/cub/device/dispatch/dispatch_merge.cuh | 6 +++++- cub/cub/device/dispatch/dispatch_merge_sort.cuh | 12 ++++++++++-- cub/cub/device/dispatch/dispatch_radix_sort.cuh | 12 ++++++++++-- cub/cub/device/dispatch/dispatch_reduce.cuh | 12 ++++++++++-- cub/cub/device/dispatch/dispatch_reduce_by_key.cuh | 12 ++++++++++-- .../dispatch/dispatch_reduce_deterministic.cuh | 6 +++++- .../dispatch/dispatch_reduce_nondeterministic.cuh | 6 +++++- cub/cub/device/dispatch/dispatch_rle.cuh | 12 ++++++++++-- cub/cub/device/dispatch/dispatch_scan.cuh | 12 ++++++++++-- cub/cub/device/dispatch/dispatch_scan_by_key.cuh | 12 ++++++++++-- .../dispatch/dispatch_segmented_radix_sort.cuh | 12 ++++++++++-- .../device/dispatch/dispatch_segmented_reduce.cuh | 12 ++++++++++-- cub/cub/device/dispatch/dispatch_segmented_scan.cuh | 6 +++++- cub/cub/device/dispatch/dispatch_segmented_sort.cuh | 12 ++++++++++-- cub/cub/device/dispatch/dispatch_select_if.cuh | 12 ++++++++++-- .../device/dispatch/dispatch_three_way_partition.cuh | 12 ++++++++++-- cub/cub/device/dispatch/dispatch_topk.cuh | 6 +++++- cub/cub/device/dispatch/dispatch_transform.cuh | 6 +++++- cub/cub/device/dispatch/dispatch_unique_by_key.cuh | 12 ++++++++++-- cub/cub/util_device.cuh | 10 +++++++++- 26 files changed, 204 insertions(+), 40 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index d8dae488d1a..f9779b87a7f 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -285,7 +285,11 @@ struct DispatchAdjacentDifference DifferenceOpT difference_op, cudaStream_t stream) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + cudaError error = cudaSuccess; do { @@ -337,7 +341,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using InputT = detail::it_value_t; ::cuda::compute_capability cc{}; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 7d6f23d5bbe..294520d3705 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -305,7 +305,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream, PolicySelectorT policy_selector = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using per_invocation_buffer_offset_t = detail::batch_memcpy::per_invocation_buffer_offset_t; using BufferSizeT = cub::detail::it_value_t; using BLevBufferOffsetTileState = cub::ScanTileState; diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh index 9908577a9db..7a3e741a340 100644 --- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh @@ -213,7 +213,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream = nullptr, [[maybe_unused]] PolicySelector policy_selector = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using large_segment_tile_offset_t = typename TotalNumItemsGuaranteeT::value_type; // Helper that determines (a) whether there's any one-worker-per-segment policy supporting the range of segment // sizes and k, and (b) if so, which set of one-worker-per-segment policies to use diff --git a/cub/cub/device/dispatch/dispatch_find.cuh b/cub/cub/device/dispatch/dispatch_find.cuh index c5824ecc4b3..fa8173511d9 100644 --- a/cub/cub/device/dispatch/dispatch_find.cuh +++ b/cub/cub/device/dispatch/dispatch_find.cuh @@ -97,7 +97,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream, PolicySelector policy_selector = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using output_t = it_value_t; // if the output iterator can be turned into a pointer, the value type is integral, and has the same size as OffsetT diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh index b1f6d9fb7c1..89ff7f15aa5 100644 --- a/cub/cub/device/dispatch/dispatch_for.cuh +++ b/cub/cub/device/dispatch/dispatch_for.cuh @@ -115,7 +115,11 @@ template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(OffsetT num_items, OpT op, cudaStream_t stream, PolicySelector policy_selector = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + if (num_items == 0) { return cudaSuccess; diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 2491d2b2f5f..dcce53ffc77 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -192,7 +192,11 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index 92e14b45725..b704cd16579 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -210,7 +210,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 224fe430a31..da951585a17 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -403,7 +403,11 @@ public: KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -474,7 +478,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KeyT* = nullptr /* for CCCL.C */, ValueT* = nullptr /* for CCCL.C */) -> cudaError_t { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + [[maybe_unused]] constexpr bool keys_only = ::cuda::std::is_same_v; if (num_items == 0) diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index dc23c9f12f8..904ef2ca01c 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1138,7 +1138,11 @@ public: KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -1203,7 +1207,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 2cb09a901dc..3321a5c05ee 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -478,7 +478,11 @@ struct DispatchReduce KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -751,7 +755,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // from Dispatch() ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index 299ba312212..80d71a0c815 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -606,7 +606,11 @@ struct DispatchReduceByKey OffsetT num_items, cudaStream_t stream) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + cudaError error = cudaSuccess; do @@ -695,7 +699,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( cudaStream_t stream, PolicySelector policy_selector = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet using ScanTileStateT = ReduceByKeyScanTileState; [[maybe_unused]] static constexpr int init_kernel_threads = 128; diff --git a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh index eb77d983bc1..9b8dcffd5b0 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh @@ -339,7 +339,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get CC ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) diff --git a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh index e1b8007d461..10e4a34fe1e 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh @@ -173,7 +173,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get CC ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index bb4bf4668d2..ca25edcb21e 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -605,7 +605,11 @@ struct DeviceRleDispatch OffsetT num_items, cudaStream_t stream) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + cudaError error = cudaSuccess; // Get PTX version @@ -663,7 +667,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( cudaStream_t stream, PolicySelector policy_selector = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using local_offset_t = ::cuda::std::int32_t; using global_offset_t = OffsetT; static constexpr bool use_streaming_invocation = diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index cfd9b4cf1a6..b34a8d2296c 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -862,7 +862,11 @@ struct DispatchScan KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -930,7 +934,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) -> cudaError_t { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + static_assert(::cuda::std::is_unsigned_v && sizeof(OffsetT) >= 4, "DispatchScan only supports unsigned offset types of at least 4-bytes"); diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index f0957c0ea90..9a45c8a3c12 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -596,7 +596,11 @@ struct DispatchScanByKey KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) { @@ -734,7 +738,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) -> cudaError_t { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + static_assert(::cuda::std::is_unsigned_v && sizeof(OffsetT) >= 4, "DispatchScan only supports unsigned offset types of at least 4-bytes"); diff --git a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh index 5c37dbde822..feb5b0d7722 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh @@ -617,7 +617,11 @@ struct DispatchSegmentedRadixSort KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + cudaError_t error; do { @@ -904,7 +908,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( DecomposerT decomposer = {}, TuningEnvT = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using default_policy_selector_t = policy_selector_from_types; using policy_selector_t = ::cuda::std::decay_t< ::cuda::std::execution::__query_result_or_t>; diff --git a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh index 20cc042575e..d06fb1e9743 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh @@ -421,7 +421,11 @@ struct DispatchSegmentedReduce KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + if (num_segments <= 0) { return cudaSuccess; @@ -528,7 +532,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + if (num_segments <= 0) { return cudaSuccess; diff --git a/cub/cub/device/dispatch/dispatch_segmented_scan.cuh b/cub/cub/device/dispatch/dispatch_segmented_scan.cuh index 6c9b7dd5e07..9b78a960190 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_scan.cuh @@ -129,7 +129,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + static_assert(::cuda::std::is_integral_v && sizeof(OffsetT) >= 4 && sizeof(OffsetT) <= 8, "dispatch_segmented_scan only supports integral offset types of 4- or 8-bytes"); diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 2ac012906ac..4c67b57702f 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -689,7 +689,11 @@ struct DispatchSegmentedSort MaxPolicyT max_policy = {}, PartitionMaxPolicyT partition_max_policy = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -1282,7 +1286,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( PartitionKernelSource partition_kernel_source = {}, KernelLauncherFactory launcher_factory = {}) -> cudaError_t { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + [[maybe_unused]] static constexpr bool keys_only = ::cuda::std::is_same_v; const auto get_num_passes = [&](int radix_bits) { diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 6eabeaa10b5..76f3be27868 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -843,7 +843,11 @@ struct DispatchSelectIf OffsetT num_items, cudaStream_t stream) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + int ptx_version = 0; if (cudaError_t error = CubDebug(PtxVersion(ptx_version))) { @@ -1102,7 +1106,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index c69afafba45..f7e77a5d15e 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -364,7 +364,11 @@ struct DispatchThreeWayPartitionIf KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (cudaError error = CubDebug(launcher_factory.PtxVersion(ptx_version)); cudaSuccess != error) @@ -435,7 +439,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh index d2167f72a54..6856b1928f3 100644 --- a/cub/cub/device/dispatch/dispatch_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_topk.cuh @@ -475,7 +475,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 805d20e21b1..362f9827cb6 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -556,7 +556,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + static_assert( ::cuda::std::is_same_v || ::cuda::std::is_same_v, "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types"); diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index cf8b5e4e096..a67b1cb6e50 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -437,7 +437,11 @@ struct DispatchUniqueByKey KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) { @@ -503,7 +507,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KeyT* = nullptr /* for CCCL.C */, ValueT* = nullptr /* for CCCL.C */) -> cudaError_t { - validate_stream_device(stream); + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 7196d4fb1bc..80dea8043a1 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -456,6 +456,9 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream([[maybe_unused]] cudaStream_t NV_IF_ELSE_TARGET(NV_IS_HOST, (return CubDebug(cudaStreamSynchronize(stream));), (return cudaErrorNotSupported;)) } +namespace detail +{ +// Validates stream's device is current device, when CTK >= 12.8, otherwise does nothing. CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t validate_stream_device(cudaStream_t stream) { cudaError_t error = cudaSuccess; @@ -472,10 +475,15 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t validate_stream_device(cudaSt { return error; } - static_assert(currentDevice == streamDevice); + _CCCL_ASSERT(currentDevice == streamDevice, "current device must match CUB stream device"); + if (currentDevice != streamDevice) + { + return cudaErrorInvalidDevice; + } # endif // _CCCL_CTK_AT_LEAST(12,8) return error; } +} // namespace detail //! @brief Computes the maximum potential dynamic shared memory size per block for kernel @p kernel_ptr taking into //! account the amount of kernel's static and CUDA Driver's reserved shared memory. From 8de37554ebab674485e2713efdd4461b2400c3c4 Mon Sep 17 00:00:00 2001 From: anon <79939935+thom-gg@users.noreply.github.com> Date: Sat, 23 May 2026 21:54:49 +0200 Subject: [PATCH 3/3] add missing assertions to some dispatch functions --- cub/cub/device/dispatch/dispatch_scan_by_key.cuh | 5 +++++ cub/cub/device/dispatch/dispatch_segmented_reduce.cuh | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 9a45c8a3c12..eae22bd33c8 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -641,6 +641,11 @@ struct DispatchScanByKey KernelSourceT kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh index d06fb1e9743..00ca7c42619 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh @@ -735,6 +735,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + if (num_segments <= 0) { if (d_temp_storage == nullptr)