Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,11 @@ struct DispatchAdjacentDifference
DifferenceOpT difference_op,
cudaStream_t stream)
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

cudaError error = cudaSuccess;
do
{
Expand Down Expand Up @@ -336,6 +341,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
PolicySelector policy_selector = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

using InputT = detail::it_value_t<InputIteratorT>;

::cuda::compute_capability cc{};
Expand Down
5 changes: 5 additions & 0 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
cudaStream_t stream,
PolicySelectorT policy_selector = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

using per_invocation_buffer_offset_t = detail::batch_memcpy::per_invocation_buffer_offset_t;
using BufferSizeT = cub::detail::it_value_t<BufferSizeIteratorT>;
using BLevBufferOffsetTileState = cub::ScanTileState<per_invocation_buffer_offset_t>;
Expand Down
5 changes: 5 additions & 0 deletions cub/cub/device/dispatch/dispatch_batched_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
cudaStream_t stream = nullptr,
[[maybe_unused]] PolicySelector policy_selector = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

using large_segment_tile_offset_t = typename TotalNumItemsGuaranteeT::value_type;
// Helper that determines (a) whether there's any one-worker-per-segment policy supporting the range of segment
// sizes and k, and (b) if so, which set of one-worker-per-segment policies to use
Expand Down
5 changes: 5 additions & 0 deletions cub/cub/device/dispatch/dispatch_find.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
cudaStream_t stream,
PolicySelector policy_selector = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

using output_t = it_value_t<OutputIteratorT>;

// if the output iterator can be turned into a pointer, the value type is integral, and has the same size as OffsetT
Expand Down
5 changes: 5 additions & 0 deletions cub/cub/device/dispatch/dispatch_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ template <class OffsetT, class OpT, class PolicySelector = policy_selector>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t
dispatch(OffsetT num_items, OpT op, cudaStream_t stream, PolicySelector policy_selector = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

if (num_items == 0)
{
return cudaSuccess;
Expand Down
5 changes: 5 additions & 0 deletions cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,11 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE auto dispatch(
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

::cuda::compute_capability cc{};
if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
{
Expand Down
5 changes: 5 additions & 0 deletions cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
PolicySelector policy_selector = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

::cuda::compute_capability cc{};
if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
{
Expand Down
10 changes: 10 additions & 0 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -403,6 +403,11 @@ public:
KernelLauncherFactory launcher_factory = {},
MaxPolicyT max_policy = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

// Get PTX version
int ptx_version = 0;
if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
Expand Down Expand Up @@ -473,6 +478,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
KeyT* = nullptr /* for CCCL.C */,
ValueT* = nullptr /* for CCCL.C */) -> cudaError_t
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

[[maybe_unused]] constexpr bool keys_only = ::cuda::std::is_same_v<ValueT, NullType>;

if (num_items == 0)
Expand Down
10 changes: 10 additions & 0 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,11 @@ public:
KernelLauncherFactory launcher_factory = {},
MaxPolicyT max_policy = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

// Get PTX version
int ptx_version = 0;
if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
Expand Down Expand Up @@ -1202,6 +1207,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

::cuda::compute_capability cc{};
if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
{
Expand Down
10 changes: 10 additions & 0 deletions cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,11 @@ struct DispatchReduce
KernelLauncherFactory launcher_factory = {},
MaxPolicyT max_policy = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

// Get PTX version
int ptx_version = 0;
if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
Expand Down Expand Up @@ -750,6 +755,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

// from Dispatch()
::cuda::compute_capability cc{};
if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
Expand Down
10 changes: 10 additions & 0 deletions cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,11 @@ struct DispatchReduceByKey
OffsetT num_items,
cudaStream_t stream)
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

cudaError error = cudaSuccess;

do
Expand Down Expand Up @@ -694,6 +699,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
cudaStream_t stream,
PolicySelector policy_selector = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

using streaming_context_t = NullType; // streaming context not used for ReduceByKey yet
using ScanTileStateT = ReduceByKeyScanTileState<AccumT, OffsetT>;
[[maybe_unused]] static constexpr int init_kernel_threads = 128;
Expand Down
5 changes: 5 additions & 0 deletions cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
PolicySelector policy_selector = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

// Get CC
::cuda::compute_capability cc{};
if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

// Get CC
::cuda::compute_capability cc{};
if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
Expand Down
10 changes: 10 additions & 0 deletions cub/cub/device/dispatch/dispatch_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,11 @@ struct DeviceRleDispatch
OffsetT num_items,
cudaStream_t stream)
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

cudaError error = cudaSuccess;

// Get PTX version
Expand Down Expand Up @@ -662,6 +667,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t dispatch(
cudaStream_t stream,
PolicySelector policy_selector = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

using local_offset_t = ::cuda::std::int32_t;
using global_offset_t = OffsetT;
static constexpr bool use_streaming_invocation =
Expand Down
10 changes: 10 additions & 0 deletions cub/cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -862,6 +862,11 @@ struct DispatchScan
KernelLauncherFactory launcher_factory = {},
MaxPolicyT max_policy = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

// Get PTX version
int ptx_version = 0;
if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
Expand Down Expand Up @@ -929,6 +934,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {}) -> cudaError_t
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

static_assert(::cuda::std::is_unsigned_v<OffsetT> && sizeof(OffsetT) >= 4,
"DispatchScan only supports unsigned offset types of at least 4-bytes");

Expand Down
15 changes: 15 additions & 0 deletions cub/cub/device/dispatch/dispatch_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,11 @@ struct DispatchScanByKey
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

int ptx_version = 0;
if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
{
Expand Down Expand Up @@ -636,6 +641,11 @@ struct DispatchScanByKey
KernelSourceT kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

::cuda::compute_capability cc{};
if (const auto error = CubDebug(launcher_factory.PtxComputeCap(cc)))
{
Expand Down Expand Up @@ -733,6 +743,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {}) -> cudaError_t
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

static_assert(::cuda::std::is_unsigned_v<OffsetT> && sizeof(OffsetT) >= 4,
"DispatchScan only supports unsigned offset types of at least 4-bytes");

Expand Down
10 changes: 10 additions & 0 deletions cub/cub/device/dispatch/dispatch_segmented_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,11 @@ struct DispatchSegmentedRadixSort
KernelLauncherFactory launcher_factory = {},
MaxPolicyT max_policy = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

cudaError_t error;
do
{
Expand Down Expand Up @@ -903,6 +908,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
DecomposerT decomposer = {},
TuningEnvT = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

using default_policy_selector_t = policy_selector_from_types<KeyT, ValueT, SegmentSizeT>;
using policy_selector_t = ::cuda::std::decay_t<
::cuda::std::execution::__query_result_or_t<TuningEnvT, segmented_radix_sort_policy, default_policy_selector_t>>;
Expand Down
15 changes: 15 additions & 0 deletions cub/cub/device/dispatch/dispatch_segmented_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,11 @@ struct DispatchSegmentedReduce
KernelLauncherFactory launcher_factory = {},
MaxPolicyT max_policy = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

if (num_segments <= 0)
{
return cudaSuccess;
Expand Down Expand Up @@ -527,6 +532,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

if (num_segments <= 0)
{
return cudaSuccess;
Expand Down Expand Up @@ -725,6 +735,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size(
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

if (num_segments <= 0)
{
if (d_temp_storage == nullptr)
Expand Down
5 changes: 5 additions & 0 deletions cub/cub/device/dispatch/dispatch_segmented_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
KernelSource kernel_source = {},
KernelLauncherFactory launcher_factory = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

static_assert(::cuda::std::is_integral_v<OffsetT> && sizeof(OffsetT) >= 4 && sizeof(OffsetT) <= 8,
"dispatch_segmented_scan only supports integral offset types of 4- or 8-bytes");

Expand Down
10 changes: 10 additions & 0 deletions cub/cub/device/dispatch/dispatch_segmented_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,11 @@ struct DispatchSegmentedSort
MaxPolicyT max_policy = {},
PartitionMaxPolicyT partition_max_policy = {})
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

// Get PTX version
int ptx_version = 0;
if (const auto error = CubDebug(launcher_factory.PtxVersion(ptx_version)))
Expand Down Expand Up @@ -1281,6 +1286,11 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch(
PartitionKernelSource partition_kernel_source = {},
KernelLauncherFactory launcher_factory = {}) -> cudaError_t
{
if (const auto error = CubDebug(detail::validate_stream_device(stream)))
{
return error;
}

[[maybe_unused]] static constexpr bool keys_only = ::cuda::std::is_same_v<ValueT, NullType>;

const auto get_num_passes = [&](int radix_bits) {
Expand Down
Loading