diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 808a696d8f6..f9779b87a7f 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -285,6 +285,11 @@ struct DispatchAdjacentDifference DifferenceOpT difference_op, cudaStream_t stream) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + cudaError error = cudaSuccess; do { @@ -336,6 +341,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using InputT = detail::it_value_t; ::cuda::compute_capability cc{}; diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index b0df4be876f..294520d3705 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -305,6 +305,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream, PolicySelectorT policy_selector = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using per_invocation_buffer_offset_t = detail::batch_memcpy::per_invocation_buffer_offset_t; using BufferSizeT = cub::detail::it_value_t; using BLevBufferOffsetTileState = cub::ScanTileState; diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh index ec7d39b0bcb..7a3e741a340 100644 --- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh @@ -213,6 +213,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream = nullptr, [[maybe_unused]] PolicySelector policy_selector = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using large_segment_tile_offset_t = typename TotalNumItemsGuaranteeT::value_type; // Helper that determines (a) whether there's any one-worker-per-segment policy supporting the range of segment // sizes and k, and (b) if so, which set of one-worker-per-segment policies to use diff --git a/cub/cub/device/dispatch/dispatch_find.cuh b/cub/cub/device/dispatch/dispatch_find.cuh index 024498e320a..fa8173511d9 100644 --- a/cub/cub/device/dispatch/dispatch_find.cuh +++ b/cub/cub/device/dispatch/dispatch_find.cuh @@ -97,6 +97,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream, PolicySelector policy_selector = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using output_t = it_value_t; // if the output iterator can be turned into a pointer, the value type is integral, and has the same size as OffsetT diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh index 0c1ab4dd7e6..89ff7f15aa5 100644 --- a/cub/cub/device/dispatch/dispatch_for.cuh +++ b/cub/cub/device/dispatch/dispatch_for.cuh @@ -115,6 +115,11 @@ template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(OffsetT num_items, OpT op, cudaStream_t stream, PolicySelector policy_selector = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + if (num_items == 0) { return cudaSuccess; diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index 9d5c8dc2874..dcce53ffc77 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -192,6 +192,11 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh index 18f0b310105..b704cd16579 100644 --- a/cub/cub/device/dispatch/dispatch_merge.cuh +++ b/cub/cub/device/dispatch/dispatch_merge.cuh @@ -210,6 +210,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 788b217042f..da951585a17 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -403,6 +403,11 @@ public: KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -473,6 +478,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KeyT* = nullptr /* for CCCL.C */, ValueT* = nullptr /* for CCCL.C */) -> cudaError_t { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + [[maybe_unused]] constexpr bool keys_only = ::cuda::std::is_same_v; if (num_items == 0) diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index c6eaf3ca33c..904ef2ca01c 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -1138,6 +1138,11 @@ public: KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -1202,6 +1207,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 9b71e37a569..3321a5c05ee 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -478,6 +478,11 @@ struct DispatchReduce KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -750,6 +755,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // from Dispatch() ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index e00f0786de1..80d71a0c815 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -606,6 +606,11 @@ struct DispatchReduceByKey OffsetT num_items, cudaStream_t stream) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + cudaError error = cudaSuccess; do @@ -694,6 +699,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( cudaStream_t stream, PolicySelector policy_selector = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet using ScanTileStateT = ReduceByKeyScanTileState; [[maybe_unused]] static constexpr int init_kernel_threads = 128; diff --git a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh index 4b54e9d53bc..9b8dcffd5b0 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh @@ -339,6 +339,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get CC ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) diff --git a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh index b57bf088072..10e4a34fe1e 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh @@ -173,6 +173,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get CC ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 1bfb5088c31..ca25edcb21e 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -605,6 +605,11 @@ struct DeviceRleDispatch OffsetT num_items, cudaStream_t stream) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + cudaError error = cudaSuccess; // Get PTX version @@ -662,6 +667,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( cudaStream_t stream, PolicySelector policy_selector = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using local_offset_t = ::cuda::std::int32_t; using global_offset_t = OffsetT; static constexpr bool use_streaming_invocation = diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index 6eb80ab85cb..b34a8d2296c 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -862,6 +862,11 @@ struct DispatchScan KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -929,6 +934,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) -> cudaError_t { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + static_assert(::cuda::std::is_unsigned_v && sizeof(OffsetT) >= 4, "DispatchScan only supports unsigned offset types of at least 4-bytes"); diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index 1fbf5725649..eae22bd33c8 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -596,6 +596,11 @@ struct DispatchScanByKey KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) { @@ -636,6 +641,11 @@ struct DispatchScanByKey KernelSourceT kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { @@ -733,6 +743,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) -> cudaError_t { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + static_assert(::cuda::std::is_unsigned_v && sizeof(OffsetT) >= 4, "DispatchScan only supports unsigned offset types of at least 4-bytes"); diff --git a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh index 4b90a2f1978..feb5b0d7722 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh @@ -617,6 +617,11 @@ struct DispatchSegmentedRadixSort KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + cudaError_t error; do { @@ -903,6 +908,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( DecomposerT decomposer = {}, TuningEnvT = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + using default_policy_selector_t = policy_selector_from_types; using policy_selector_t = ::cuda::std::decay_t< ::cuda::std::execution::__query_result_or_t>; diff --git a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh index 63ffd1c7a7a..00ca7c42619 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh @@ -421,6 +421,11 @@ struct DispatchSegmentedReduce KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + if (num_segments <= 0) { return cudaSuccess; @@ -527,6 +532,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + if (num_segments <= 0) { return cudaSuccess; @@ -725,6 +735,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + if (num_segments <= 0) { if (d_temp_storage == nullptr) diff --git a/cub/cub/device/dispatch/dispatch_segmented_scan.cuh b/cub/cub/device/dispatch/dispatch_segmented_scan.cuh index 221aca57b00..9b78a960190 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_scan.cuh @@ -129,6 +129,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + static_assert(::cuda::std::is_integral_v && sizeof(OffsetT) >= 4 && sizeof(OffsetT) <= 8, "dispatch_segmented_scan only supports integral offset types of 4- or 8-bytes"); diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 9f317ec85ee..4c67b57702f 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -689,6 +689,11 @@ struct DispatchSegmentedSort MaxPolicyT max_policy = {}, PartitionMaxPolicyT partition_max_policy = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) @@ -1281,6 +1286,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( PartitionKernelSource partition_kernel_source = {}, KernelLauncherFactory launcher_factory = {}) -> cudaError_t { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + [[maybe_unused]] static constexpr bool keys_only = ::cuda::std::is_same_v; const auto get_num_passes = [&](int radix_bits) { diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 30626dac474..76f3be27868 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -843,6 +843,11 @@ struct DispatchSelectIf OffsetT num_items, cudaStream_t stream) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + int ptx_version = 0; if (cudaError_t error = CubDebug(PtxVersion(ptx_version))) { @@ -1101,6 +1106,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 2cabaf3a31a..f7e77a5d15e 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -364,6 +364,11 @@ struct DispatchThreeWayPartitionIf KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + // Get PTX version int ptx_version = 0; if (cudaError error = CubDebug(launcher_factory.PtxVersion(ptx_version)); cudaSuccess != error) @@ -434,6 +439,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh index d0d171eb88a..6856b1928f3 100644 --- a/cub/cub/device/dispatch/dispatch_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_topk.cuh @@ -475,6 +475,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( PolicySelector policy_selector = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh index 098b9b38f00..362f9827cb6 100644 --- a/cub/cub/device/dispatch/dispatch_transform.cuh +++ b/cub/cub/device/dispatch/dispatch_transform.cuh @@ -556,6 +556,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch( KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + static_assert( ::cuda::std::is_same_v || ::cuda::std::is_same_v, "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types"); diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 957e5a6589c..a67b1cb6e50 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -437,6 +437,11 @@ struct DispatchUniqueByKey KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + int ptx_version = 0; if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version))) { @@ -502,6 +507,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( KeyT* = nullptr /* for CCCL.C */, ValueT* = nullptr /* for CCCL.C */) -> cudaError_t { + if (const auto error = CubDebug(detail::validate_stream_device(stream))) + { + return error; + } + ::cuda::compute_capability cc{}; if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc))) { diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index 09caa064b2a..80dea8043a1 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -456,6 +456,35 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream([[maybe_unused]] cudaStream_t NV_IF_ELSE_TARGET(NV_IS_HOST, (return CubDebug(cudaStreamSynchronize(stream));), (return cudaErrorNotSupported;)) } +namespace detail +{ +// Validates stream's device is current device, when CTK >= 12.8, otherwise does nothing. +CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t validate_stream_device(cudaStream_t stream) +{ + cudaError_t error = cudaSuccess; +# if _CCCL_CTK_AT_LEAST(12, 8) + int streamDevice; + error = cudaStreamGetDevice(stream, &streamDevice); + if (error != cudaSuccess) + { + return error; + } + int currentDevice; + error = cudaGetDevice(¤tDevice); + if (error != cudaSuccess) + { + return error; + } + _CCCL_ASSERT(currentDevice == streamDevice, "current device must match CUB stream device"); + if (currentDevice != streamDevice) + { + return cudaErrorInvalidDevice; + } +# endif // _CCCL_CTK_AT_LEAST(12,8) + return error; +} +} // namespace detail + //! @brief Computes the maximum potential dynamic shared memory size per block for kernel @p kernel_ptr taking into //! account the amount of kernel's static and CUDA Driver's reserved shared memory. //!