From e23c56cef67f94319e1f434f9e32277148146cca Mon Sep 17 00:00:00 2001
From: anon <79939935+thom-gg@users.noreply.github.com>
Date: Sat, 23 May 2026 17:37:33 +0200
Subject: [PATCH 1/3] Asserting current device and CUB stream matches

---
 .../dispatch/dispatch_adjacent_difference.cuh |  2 ++
 .../device/dispatch/dispatch_batch_memcpy.cuh |  1 +
 .../device/dispatch/dispatch_batched_topk.cuh |  1 +
 cub/cub/device/dispatch/dispatch_find.cuh     |  1 +
 cub/cub/device/dispatch/dispatch_for.cuh      |  1 +
 .../device/dispatch/dispatch_histogram.cuh    |  1 +
 cub/cub/device/dispatch/dispatch_merge.cuh    |  1 +
 .../device/dispatch/dispatch_merge_sort.cuh   |  2 ++
 .../device/dispatch/dispatch_radix_sort.cuh   |  2 ++
 cub/cub/device/dispatch/dispatch_reduce.cuh   |  2 ++
 .../dispatch/dispatch_reduce_by_key.cuh       |  2 ++
 .../dispatch_reduce_deterministic.cuh         |  1 +
 .../dispatch_reduce_nondeterministic.cuh      |  1 +
 cub/cub/device/dispatch/dispatch_rle.cuh      |  2 ++
 cub/cub/device/dispatch/dispatch_scan.cuh     |  2 ++
 .../device/dispatch/dispatch_scan_by_key.cuh  |  2 ++
 .../dispatch_segmented_radix_sort.cuh         |  2 ++
 .../dispatch/dispatch_segmented_reduce.cuh    |  2 ++
 .../dispatch/dispatch_segmented_scan.cuh      |  1 +
 .../dispatch/dispatch_segmented_sort.cuh      |  2 ++
 .../device/dispatch/dispatch_select_if.cuh    |  2 ++
 .../dispatch/dispatch_three_way_partition.cuh |  2 ++
 cub/cub/device/dispatch/dispatch_topk.cuh     |  1 +
 .../device/dispatch/dispatch_transform.cuh    |  1 +
 .../dispatch/dispatch_unique_by_key.cuh       |  2 ++
 cub/cub/util_device.cuh                       | 21 +++++++++++++++++++
 26 files changed, 60 insertions(+)
diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index 808a696d8f6..d8dae488d1a 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -285,6 +285,7 @@ struct DispatchAdjacentDifference
     DifferenceOpT difference_op,
     cudaStream_t stream)
   {
+    validate_stream_device(stream);
     cudaError error = cudaSuccess;
     do
     {
@@ -336,6 +337,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   PolicySelector policy_selector         = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   using InputT = detail::it_value_t<InputIteratorT>;
 
   ::cuda::compute_capability cc{};
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index b0df4be876f..7d6f23d5bbe 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -305,6 +305,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream,
   PolicySelectorT policy_selector = {})
 {
+  validate_stream_device(stream);
   using per_invocation_buffer_offset_t = detail::batch_memcpy::per_invocation_buffer_offset_t;
   using BufferSizeT                    = cub::detail::it_value_t<BufferSizeIteratorT>;
   using BLevBufferOffsetTileState      = cub::ScanTileState<per_invocation_buffer_offset_t>;
diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
index ec7d39b0bcb..9908577a9db 100644
--- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
@@ -213,6 +213,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream                             = nullptr,
   [[maybe_unused]] PolicySelector policy_selector = {})
 {
+  validate_stream_device(stream);
   using large_segment_tile_offset_t = typename TotalNumItemsGuaranteeT::value_type;
   // Helper that determines (a) whether there's any one-worker-per-segment policy supporting the range of segment
   // sizes and k, and (b) if so, which set of one-worker-per-segment policies to use
diff --git a/cub/cub/device/dispatch/dispatch_find.cuh b/cub/cub/device/dispatch/dispatch_find.cuh
index 024498e320a..c5824ecc4b3 100644
--- a/cub/cub/device/dispatch/dispatch_find.cuh
+++ b/cub/cub/device/dispatch/dispatch_find.cuh
@@ -97,6 +97,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream,
   PolicySelector policy_selector = {})
 {
+  validate_stream_device(stream);
   using output_t = it_value_t<OutputIteratorT>;
 
   // if the output iterator can be turned into a pointer, the value type is integral, and has the same size as OffsetT
diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index 0c1ab4dd7e6..b1f6d9fb7c1 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -115,6 +115,7 @@ template <class OffsetT, class OpT, class PolicySelector = policy_selector>
 CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
 dispatch(OffsetT num_items, OpT op, cudaStream_t stream, PolicySelector policy_selector = {})
 {
+  validate_stream_device(stream);
   if (num_items == 0)
   {
     return cudaSuccess;
diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh
index 9d5c8dc2874..2491d2b2f5f 100644
--- a/cub/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -192,6 +192,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh
index 18f0b310105..92e14b45725 100644
--- a/cub/cub/device/dispatch/dispatch_merge.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge.cuh
@@ -210,6 +210,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   PolicySelector policy_selector         = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
index 788b217042f..224fe430a31 100644
--- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
@@ -403,6 +403,7 @@ public:
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
+    validate_stream_device(stream);
     // Get PTX version
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
@@ -473,6 +474,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KeyT*                                  = nullptr /* for CCCL.C */,
   ValueT*                                = nullptr /* for CCCL.C */) -> cudaError_t
 {
+  validate_stream_device(stream);
   [[maybe_unused]] constexpr bool keys_only = ::cuda::std::is_same_v<ValueT, NullType>;
 
   if (num_items == 0)
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
index c6eaf3ca33c..dc23c9f12f8 100644
--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -1138,6 +1138,7 @@ public:
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
+    validate_stream_device(stream);
     // Get PTX version
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
@@ -1202,6 +1203,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index 9b71e37a569..2cb09a901dc 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -478,6 +478,7 @@ struct DispatchReduce
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
+    validate_stream_device(stream);
     // Get PTX version
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
@@ -750,6 +751,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   // from Dispatch()
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index e00f0786de1..299ba312212 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -606,6 +606,7 @@ struct DispatchReduceByKey
     OffsetT num_items,
     cudaStream_t stream)
   {
+    validate_stream_device(stream);
     cudaError error = cudaSuccess;
 
     do
@@ -694,6 +695,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
   cudaStream_t stream,
   PolicySelector policy_selector = {})
 {
+  validate_stream_device(stream);
   using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet
   using ScanTileStateT      = ReduceByKeyScanTileState<AccumT, OffsetT>;
   [[maybe_unused]] static constexpr int init_kernel_threads = 128;
diff --git a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh
index 4b54e9d53bc..eb77d983bc1 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh
@@ -339,6 +339,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   PolicySelector policy_selector         = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   // Get CC
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
diff --git a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh
index b57bf088072..e1b8007d461 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh
@@ -173,6 +173,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   // Get CC
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh
index 1bfb5088c31..bb4bf4668d2 100644
--- a/cub/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/cub/device/dispatch/dispatch_rle.cuh
@@ -605,6 +605,7 @@ struct DeviceRleDispatch
     OffsetT num_items,
     cudaStream_t stream)
   {
+    validate_stream_device(stream);
     cudaError error = cudaSuccess;
 
     // Get PTX version
@@ -662,6 +663,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
   cudaStream_t stream,
   PolicySelector policy_selector = {})
 {
+  validate_stream_device(stream);
   using local_offset_t  = ::cuda::std::int32_t;
   using global_offset_t = OffsetT;
   static constexpr bool use_streaming_invocation =
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index 6eb80ab85cb..cfd9b4cf1a6 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -862,6 +862,7 @@ struct DispatchScan
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
+    validate_stream_device(stream);
     // Get PTX version
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
@@ -929,6 +930,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {}) -> cudaError_t
 {
+  validate_stream_device(stream);
   static_assert(::cuda::std::is_unsigned_v<OffsetT> && sizeof(OffsetT) >= 4,
                 "DispatchScan only supports unsigned offset types of at least 4-bytes");
 
diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index 1fbf5725649..f0957c0ea90 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -596,6 +596,7 @@ struct DispatchScanByKey
     KernelSource kernel_source             = {},
     KernelLauncherFactory launcher_factory = {})
   {
+    validate_stream_device(stream);
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
     {
@@ -733,6 +734,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {}) -> cudaError_t
 {
+  validate_stream_device(stream);
   static_assert(::cuda::std::is_unsigned_v<OffsetT> && sizeof(OffsetT) >= 4,
                 "DispatchScan only supports unsigned offset types of at least 4-bytes");
 
diff --git a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh
index 4b90a2f1978..5c37dbde822 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh
@@ -617,6 +617,7 @@ struct DispatchSegmentedRadixSort
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
+    validate_stream_device(stream);
     cudaError_t error;
     do
     {
@@ -903,6 +904,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   DecomposerT decomposer = {},
   TuningEnvT             = {})
 {
+  validate_stream_device(stream);
   using default_policy_selector_t = policy_selector_from_types<KeyT, ValueT, SegmentSizeT>;
   using policy_selector_t         = ::cuda::std::decay_t<
             ::cuda::std::execution::__query_result_or_t<TuningEnvT, segmented_radix_sort_policy, default_policy_selector_t>>;
diff --git a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
index 63ffd1c7a7a..20cc042575e 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
@@ -421,6 +421,7 @@ struct DispatchSegmentedReduce
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
+    validate_stream_device(stream);
     if (num_segments <= 0)
     {
       return cudaSuccess;
@@ -527,6 +528,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   if (num_segments <= 0)
   {
     return cudaSuccess;
diff --git a/cub/cub/device/dispatch/dispatch_segmented_scan.cuh b/cub/cub/device/dispatch/dispatch_segmented_scan.cuh
index 221aca57b00..6c9b7dd5e07 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_scan.cuh
@@ -129,6 +129,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   static_assert(::cuda::std::is_integral_v<OffsetT> && sizeof(OffsetT) >= 4 && sizeof(OffsetT) <= 8,
                 "dispatch_segmented_scan only supports integral offset types of 4- or 8-bytes");
 
diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
index 9f317ec85ee..2ac012906ac 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
@@ -689,6 +689,7 @@ struct DispatchSegmentedSort
     MaxPolicyT max_policy                         = {},
     PartitionMaxPolicyT partition_max_policy      = {})
   {
+    validate_stream_device(stream);
     // Get PTX version
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
@@ -1281,6 +1282,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   PartitionKernelSource partition_kernel_source     = {},
   KernelLauncherFactory launcher_factory            = {}) -> cudaError_t
 {
+  validate_stream_device(stream);
   [[maybe_unused]] static constexpr bool keys_only = ::cuda::std::is_same_v<ValueT, NullType>;
 
   const auto get_num_passes = [&](int radix_bits) {
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 30626dac474..6eabeaa10b5 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -843,6 +843,7 @@ struct DispatchSelectIf
     OffsetT num_items,
     cudaStream_t stream)
   {
+    validate_stream_device(stream);
     int ptx_version = 0;
     if (cudaError_t error = CubDebug(PtxVersion(ptx_version)))
     {
@@ -1101,6 +1102,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   PolicySelector policy_selector         = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
index 2cabaf3a31a..c69afafba45 100644
--- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
+++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
@@ -364,6 +364,7 @@ struct DispatchThreeWayPartitionIf
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
+    validate_stream_device(stream);
     // Get PTX version
     int ptx_version = 0;
     if (cudaError error = CubDebug(launcher_factory.PtxVersion(ptx_version)); cudaSuccess != error)
@@ -434,6 +435,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh
index d0d171eb88a..d2167f72a54 100644
--- a/cub/cub/device/dispatch/dispatch_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_topk.cuh
@@ -475,6 +475,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   PolicySelector policy_selector         = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh
index 098b9b38f00..805d20e21b1 100644
--- a/cub/cub/device/dispatch/dispatch_transform.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform.cuh
@@ -556,6 +556,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  validate_stream_device(stream);
   static_assert(
     ::cuda::std::is_same_v<Offset, ::cuda::std::int32_t> || ::cuda::std::is_same_v<Offset, ::cuda::std::int64_t>,
     "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types");
diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
index 957e5a6589c..cf8b5e4e096 100644
--- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
@@ -437,6 +437,7 @@ struct DispatchUniqueByKey
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
+    validate_stream_device(stream);
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
     {
@@ -502,6 +503,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KeyT*                                  = nullptr /* for CCCL.C */,
   ValueT*                                = nullptr /* for CCCL.C */) -> cudaError_t
 {
+  validate_stream_device(stream);
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
index 09caa064b2a..7196d4fb1bc 100644
--- a/cub/cub/util_device.cuh
+++ b/cub/cub/util_device.cuh
@@ -456,6 +456,27 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream([[maybe_unused]] cudaStream_t
   NV_IF_ELSE_TARGET(NV_IS_HOST, (return CubDebug(cudaStreamSynchronize(stream));), (return cudaErrorNotSupported;))
 }
 
+CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t validate_stream_device(cudaStream_t stream)
+{
+  cudaError_t error = cudaSuccess;
+#  if _CCCL_CTK_AT_LEAST(12, 8)
+  int streamDevice;
+  error = cudaStreamGetDevice(stream, &streamDevice);
+  if (error != cudaSuccess)
+  {
+    return error;
+  }
+  int currentDevice;
+  error = cudaGetDevice(&currentDevice);
+  if (error != cudaSuccess)
+  {
+    return error;
+  }
+  static_assert(currentDevice == streamDevice);
+#  endif // _CCCL_CTK_AT_LEAST(12,8)
+  return error;
+}
+
 //! @brief Computes the maximum potential dynamic shared memory size per block for kernel @p kernel_ptr taking into
 //!        account the amount of kernel's static and CUDA Driver's reserved shared memory.
 //!

From a8d21f9804c321d5ceb2ae617b47419fd65356e8 Mon Sep 17 00:00:00 2001
From: anon <79939935+thom-gg@users.noreply.github.com>
Date: Sat, 23 May 2026 19:44:27 +0200
Subject: [PATCH 2/3] Use _CCCL_ASSERT and catch error in calling function

---
 .../device/dispatch/dispatch_adjacent_difference.cuh | 12 ++++++++++--
 cub/cub/device/dispatch/dispatch_batch_memcpy.cuh    |  6 +++++-
 cub/cub/device/dispatch/dispatch_batched_topk.cuh    |  6 +++++-
 cub/cub/device/dispatch/dispatch_find.cuh            |  6 +++++-
 cub/cub/device/dispatch/dispatch_for.cuh             |  6 +++++-
 cub/cub/device/dispatch/dispatch_histogram.cuh       |  6 +++++-
 cub/cub/device/dispatch/dispatch_merge.cuh           |  6 +++++-
 cub/cub/device/dispatch/dispatch_merge_sort.cuh      | 12 ++++++++++--
 cub/cub/device/dispatch/dispatch_radix_sort.cuh      | 12 ++++++++++--
 cub/cub/device/dispatch/dispatch_reduce.cuh          | 12 ++++++++++--
 cub/cub/device/dispatch/dispatch_reduce_by_key.cuh   | 12 ++++++++++--
 .../dispatch/dispatch_reduce_deterministic.cuh       |  6 +++++-
 .../dispatch/dispatch_reduce_nondeterministic.cuh    |  6 +++++-
 cub/cub/device/dispatch/dispatch_rle.cuh             | 12 ++++++++++--
 cub/cub/device/dispatch/dispatch_scan.cuh            | 12 ++++++++++--
 cub/cub/device/dispatch/dispatch_scan_by_key.cuh     | 12 ++++++++++--
 .../dispatch/dispatch_segmented_radix_sort.cuh       | 12 ++++++++++--
 .../device/dispatch/dispatch_segmented_reduce.cuh    | 12 ++++++++++--
 cub/cub/device/dispatch/dispatch_segmented_scan.cuh  |  6 +++++-
 cub/cub/device/dispatch/dispatch_segmented_sort.cuh  | 12 ++++++++++--
 cub/cub/device/dispatch/dispatch_select_if.cuh       | 12 ++++++++++--
 .../device/dispatch/dispatch_three_way_partition.cuh | 12 ++++++++++--
 cub/cub/device/dispatch/dispatch_topk.cuh            |  6 +++++-
 cub/cub/device/dispatch/dispatch_transform.cuh       |  6 +++++-
 cub/cub/device/dispatch/dispatch_unique_by_key.cuh   | 12 ++++++++++--
 cub/cub/util_device.cuh                              | 10 +++++++++-
 26 files changed, 204 insertions(+), 40 deletions(-)

diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
index d8dae488d1a..f9779b87a7f 100644
--- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
+++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
@@ -285,7 +285,11 @@ struct DispatchAdjacentDifference
     DifferenceOpT difference_op,
     cudaStream_t stream)
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     cudaError error = cudaSuccess;
     do
     {
@@ -337,7 +341,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   PolicySelector policy_selector         = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   using InputT = detail::it_value_t<InputIteratorT>;
 
   ::cuda::compute_capability cc{};
diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
index 7d6f23d5bbe..294520d3705 100644
--- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
+++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
@@ -305,7 +305,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream,
   PolicySelectorT policy_selector = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   using per_invocation_buffer_offset_t = detail::batch_memcpy::per_invocation_buffer_offset_t;
   using BufferSizeT                    = cub::detail::it_value_t<BufferSizeIteratorT>;
   using BLevBufferOffsetTileState      = cub::ScanTileState<per_invocation_buffer_offset_t>;
diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
index 9908577a9db..7a3e741a340 100644
--- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
@@ -213,7 +213,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream                             = nullptr,
   [[maybe_unused]] PolicySelector policy_selector = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   using large_segment_tile_offset_t = typename TotalNumItemsGuaranteeT::value_type;
   // Helper that determines (a) whether there's any one-worker-per-segment policy supporting the range of segment
   // sizes and k, and (b) if so, which set of one-worker-per-segment policies to use
diff --git a/cub/cub/device/dispatch/dispatch_find.cuh b/cub/cub/device/dispatch/dispatch_find.cuh
index c5824ecc4b3..fa8173511d9 100644
--- a/cub/cub/device/dispatch/dispatch_find.cuh
+++ b/cub/cub/device/dispatch/dispatch_find.cuh
@@ -97,7 +97,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream,
   PolicySelector policy_selector = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   using output_t = it_value_t<OutputIteratorT>;
 
   // if the output iterator can be turned into a pointer, the value type is integral, and has the same size as OffsetT
diff --git a/cub/cub/device/dispatch/dispatch_for.cuh b/cub/cub/device/dispatch/dispatch_for.cuh
index b1f6d9fb7c1..89ff7f15aa5 100644
--- a/cub/cub/device/dispatch/dispatch_for.cuh
+++ b/cub/cub/device/dispatch/dispatch_for.cuh
@@ -115,7 +115,11 @@ template <class OffsetT, class OpT, class PolicySelector = policy_selector>
 CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
 dispatch(OffsetT num_items, OpT op, cudaStream_t stream, PolicySelector policy_selector = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   if (num_items == 0)
   {
     return cudaSuccess;
diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh
index 2491d2b2f5f..dcce53ffc77 100644
--- a/cub/cub/device/dispatch/dispatch_histogram.cuh
+++ b/cub/cub/device/dispatch/dispatch_histogram.cuh
@@ -192,7 +192,11 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_merge.cuh b/cub/cub/device/dispatch/dispatch_merge.cuh
index 92e14b45725..b704cd16579 100644
--- a/cub/cub/device/dispatch/dispatch_merge.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge.cuh
@@ -210,7 +210,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   PolicySelector policy_selector         = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
index 224fe430a31..da951585a17 100644
--- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh
@@ -403,7 +403,11 @@ public:
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     // Get PTX version
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
@@ -474,7 +478,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KeyT*                                  = nullptr /* for CCCL.C */,
   ValueT*                                = nullptr /* for CCCL.C */) -> cudaError_t
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   [[maybe_unused]] constexpr bool keys_only = ::cuda::std::is_same_v<ValueT, NullType>;
 
   if (num_items == 0)
diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
index dc23c9f12f8..904ef2ca01c 100644
--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -1138,7 +1138,11 @@ public:
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     // Get PTX version
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
@@ -1203,7 +1207,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
index 2cb09a901dc..3321a5c05ee 100644
--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
@@ -478,7 +478,11 @@ struct DispatchReduce
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     // Get PTX version
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
@@ -751,7 +755,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   // from Dispatch()
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
index 299ba312212..80d71a0c815 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
@@ -606,7 +606,11 @@ struct DispatchReduceByKey
     OffsetT num_items,
     cudaStream_t stream)
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     cudaError error = cudaSuccess;
 
     do
@@ -695,7 +699,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
   cudaStream_t stream,
   PolicySelector policy_selector = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet
   using ScanTileStateT      = ReduceByKeyScanTileState<AccumT, OffsetT>;
   [[maybe_unused]] static constexpr int init_kernel_threads = 128;
diff --git a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh
index eb77d983bc1..9b8dcffd5b0 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh
@@ -339,7 +339,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   PolicySelector policy_selector         = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   // Get CC
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
diff --git a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh
index e1b8007d461..10e4a34fe1e 100644
--- a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh
+++ b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh
@@ -173,7 +173,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   // Get CC
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh
index bb4bf4668d2..ca25edcb21e 100644
--- a/cub/cub/device/dispatch/dispatch_rle.cuh
+++ b/cub/cub/device/dispatch/dispatch_rle.cuh
@@ -605,7 +605,11 @@ struct DeviceRleDispatch
     OffsetT num_items,
     cudaStream_t stream)
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     cudaError error = cudaSuccess;
 
     // Get PTX version
@@ -663,7 +667,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
   cudaStream_t stream,
   PolicySelector policy_selector = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   using local_offset_t  = ::cuda::std::int32_t;
   using global_offset_t = OffsetT;
   static constexpr bool use_streaming_invocation =
diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh
index cfd9b4cf1a6..b34a8d2296c 100644
--- a/cub/cub/device/dispatch/dispatch_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan.cuh
@@ -862,7 +862,11 @@ struct DispatchScan
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     // Get PTX version
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
@@ -930,7 +934,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {}) -> cudaError_t
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   static_assert(::cuda::std::is_unsigned_v<OffsetT> && sizeof(OffsetT) >= 4,
                 "DispatchScan only supports unsigned offset types of at least 4-bytes");
 
diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index f0957c0ea90..9a45c8a3c12 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -596,7 +596,11 @@ struct DispatchScanByKey
     KernelSource kernel_source             = {},
     KernelLauncherFactory launcher_factory = {})
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
     {
@@ -734,7 +738,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {}) -> cudaError_t
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   static_assert(::cuda::std::is_unsigned_v<OffsetT> && sizeof(OffsetT) >= 4,
                 "DispatchScan only supports unsigned offset types of at least 4-bytes");
 
diff --git a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh
index 5c37dbde822..feb5b0d7722 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh
@@ -617,7 +617,11 @@ struct DispatchSegmentedRadixSort
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     cudaError_t error;
     do
     {
@@ -904,7 +908,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   DecomposerT decomposer = {},
   TuningEnvT             = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   using default_policy_selector_t = policy_selector_from_types<KeyT, ValueT, SegmentSizeT>;
   using policy_selector_t         = ::cuda::std::decay_t<
             ::cuda::std::execution::__query_result_or_t<TuningEnvT, segmented_radix_sort_policy, default_policy_selector_t>>;
diff --git a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
index 20cc042575e..d06fb1e9743 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
@@ -421,7 +421,11 @@ struct DispatchSegmentedReduce
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     if (num_segments <= 0)
     {
       return cudaSuccess;
@@ -528,7 +532,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   if (num_segments <= 0)
   {
     return cudaSuccess;
diff --git a/cub/cub/device/dispatch/dispatch_segmented_scan.cuh b/cub/cub/device/dispatch/dispatch_segmented_scan.cuh
index 6c9b7dd5e07..9b78a960190 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_scan.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_scan.cuh
@@ -129,7 +129,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   static_assert(::cuda::std::is_integral_v<OffsetT> && sizeof(OffsetT) >= 4 && sizeof(OffsetT) <= 8,
                 "dispatch_segmented_scan only supports integral offset types of 4- or 8-bytes");
 
diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
index 2ac012906ac..4c67b57702f 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh
@@ -689,7 +689,11 @@ struct DispatchSegmentedSort
     MaxPolicyT max_policy                         = {},
     PartitionMaxPolicyT partition_max_policy      = {})
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     // Get PTX version
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
@@ -1282,7 +1286,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   PartitionKernelSource partition_kernel_source     = {},
   KernelLauncherFactory launcher_factory            = {}) -> cudaError_t
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   [[maybe_unused]] static constexpr bool keys_only = ::cuda::std::is_same_v<ValueT, NullType>;
 
   const auto get_num_passes = [&](int radix_bits) {
diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh
index 6eabeaa10b5..76f3be27868 100644
--- a/cub/cub/device/dispatch/dispatch_select_if.cuh
+++ b/cub/cub/device/dispatch/dispatch_select_if.cuh
@@ -843,7 +843,11 @@ struct DispatchSelectIf
     OffsetT num_items,
     cudaStream_t stream)
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     int ptx_version = 0;
     if (cudaError_t error = CubDebug(PtxVersion(ptx_version)))
     {
@@ -1102,7 +1106,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   PolicySelector policy_selector         = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
index c69afafba45..f7e77a5d15e 100644
--- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
+++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh
@@ -364,7 +364,11 @@ struct DispatchThreeWayPartitionIf
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     // Get PTX version
     int ptx_version = 0;
     if (cudaError error = CubDebug(launcher_factory.PtxVersion(ptx_version)); cudaSuccess != error)
@@ -435,7 +439,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_topk.cuh b/cub/cub/device/dispatch/dispatch_topk.cuh
index d2167f72a54..6856b1928f3 100644
--- a/cub/cub/device/dispatch/dispatch_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_topk.cuh
@@ -475,7 +475,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   PolicySelector policy_selector         = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/device/dispatch/dispatch_transform.cuh b/cub/cub/device/dispatch/dispatch_transform.cuh
index 805d20e21b1..362f9827cb6 100644
--- a/cub/cub/device/dispatch/dispatch_transform.cuh
+++ b/cub/cub/device/dispatch/dispatch_transform.cuh
@@ -556,7 +556,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   static_assert(
     ::cuda::std::is_same_v<Offset, ::cuda::std::int32_t> || ::cuda::std::is_same_v<Offset, ::cuda::std::int64_t>,
     "cub::DeviceTransform is only tested and tuned for 32-bit or 64-bit signed offset types");
diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
index cf8b5e4e096..a67b1cb6e50 100644
--- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh
@@ -437,7 +437,11 @@ struct DispatchUniqueByKey
     KernelLauncherFactory launcher_factory = {},
     MaxPolicyT max_policy                  = {})
   {
-    validate_stream_device(stream);
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     int ptx_version = 0;
     if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
     {
@@ -503,7 +507,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
   KeyT*                                  = nullptr /* for CCCL.C */,
   ValueT*                                = nullptr /* for CCCL.C */) -> cudaError_t
 {
-  validate_stream_device(stream);
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   ::cuda::compute_capability cc{};
   if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
   {
diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh
index 7196d4fb1bc..80dea8043a1 100644
--- a/cub/cub/util_device.cuh
+++ b/cub/cub/util_device.cuh
@@ -456,6 +456,9 @@ CUB_RUNTIME_FUNCTION inline cudaError_t SyncStream([[maybe_unused]] cudaStream_t
   NV_IF_ELSE_TARGET(NV_IS_HOST, (return CubDebug(cudaStreamSynchronize(stream));), (return cudaErrorNotSupported;))
 }
 
+namespace detail
+{
+// Validates stream's device is current device, when CTK >= 12.8, otherwise does nothing.
 CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t validate_stream_device(cudaStream_t stream)
 {
   cudaError_t error = cudaSuccess;
@@ -472,10 +475,15 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t validate_stream_device(cudaSt
   {
     return error;
   }
-  static_assert(currentDevice == streamDevice);
+  _CCCL_ASSERT(currentDevice == streamDevice, "current device must match CUB stream device");
+  if (currentDevice != streamDevice)
+  {
+    return cudaErrorInvalidDevice;
+  }
 #  endif // _CCCL_CTK_AT_LEAST(12,8)
   return error;
 }
+} // namespace detail
 
 //! @brief Computes the maximum potential dynamic shared memory size per block for kernel @p kernel_ptr taking into
 //!        account the amount of kernel's static and CUDA Driver's reserved shared memory.

From 8de37554ebab674485e2713efdd4461b2400c3c4 Mon Sep 17 00:00:00 2001
From: anon <79939935+thom-gg@users.noreply.github.com>
Date: Sat, 23 May 2026 21:54:49 +0200
Subject: [PATCH 3/3] add missing assertions to some dispatch functions

---
 cub/cub/device/dispatch/dispatch_scan_by_key.cuh      | 5 +++++
 cub/cub/device/dispatch/dispatch_segmented_reduce.cuh | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
index 9a45c8a3c12..eae22bd33c8 100644
--- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh
@@ -641,6 +641,11 @@ struct DispatchScanByKey
     KernelSourceT kernel_source            = {},
     KernelLauncherFactory launcher_factory = {})
   {
+    if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+    {
+      return error;
+    }
+
     ::cuda::compute_capability cc{};
     if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
     {
diff --git a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
index d06fb1e9743..00ca7c42619 100644
--- a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
+++ b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
@@ -735,6 +735,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size(
   KernelSource kernel_source             = {},
   KernelLauncherFactory launcher_factory = {})
 {
+  if (const auto error = CubDebug(detail::validate_stream_device(stream)))
+  {
+    return error;
+  }
+
   if (num_segments <= 0)
   {
     if (d_temp_storage == nullptr)