From 77b2090853443c59c60fed4b411ec2cbdc5debaa Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 9 Feb 2026 13:31:35 -0500 Subject: [PATCH 01/17] Create helper function for determining maximum block size --- include/core/detail/hip_utils.hpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index d8a918f1..59e092d6 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -25,6 +25,8 @@ #include #include +#include "core/hip_assert.h" + namespace roccv::detail { static void StreamCallback(void* userData) { std::function* func = static_cast*>(userData); @@ -46,4 +48,26 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) { HIP_VALIDATE_NO_ERRORS(hipLaunchHostFunc(stream, StreamCallback, data.get())); data.release(); // Release ownership, StreamCallback is responsible for it now } + +/** + * @brief Get the maximum potential block size for a 2D kernel. + * + * @param[in] kernel The kernel function to get the maximum potential block size for. + * @param[in] sharedMemSizePerBlock The shared memory size per block. + * @return The maximum potential block size. + */ +template +dim3 GetMaximumPotentialBlockSize2D(KernalFunc kernel, size_t sharedMemSizePerBlock) { + int minimumGridSize; + int blockSize; + int deviceId; + int warpSize; + + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId)); + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId)); + HIP_VALIDATE_NO_ERRORS( + hipOccupancyMaxPotentialBlockSize(&minimumGridSize, &blockSize, kernel, sharedMemSizePerBlock, warpSize)); + + return dim3(warpSize, blockSize / warpSize, 1); +} } // namespace roccv::detail \ No newline at end of file From 6d616afe5ab9ffe9bcc775537520249be92b3c13 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 9 Feb 2026 13:44:22 -0500 Subject: [PATCH 02/17] Define StreamCallback as static inline --- include/core/detail/hip_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 59e092d6..0030221a 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -28,7 +28,7 @@ #include "core/hip_assert.h" namespace roccv::detail { -static void StreamCallback(void* userData) { +static inline void StreamCallback(void* userData) { std::function* func = static_cast*>(userData); (*func)(); delete func; From 348816dd58b536f932948c72a9458ace1b84ea84 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 9 Feb 2026 13:44:48 -0500 Subject: [PATCH 03/17] Use dynamic block size helper for Composite operator --- src/op_composite.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/op_composite.cpp b/src/op_composite.cpp index 6d49a156..8073c7f4 100644 --- a/src/op_composite.cpp +++ b/src/op_composite.cpp @@ -24,6 +24,7 @@ #include #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/composite_device.hpp" #include "kernels/host/composite_host.hpp" @@ -40,7 +41,8 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::composite, ImageWrapper, ImageWrapper>, 0); dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, outputWrapper.batches()); Kernels::Device::composite<<>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper); From 3b0488e4d298921d5d895241393e17611bd734c4 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 9 Feb 2026 14:18:04 -0500 Subject: [PATCH 04/17] Add helper for grid size calculations --- include/core/detail/hip_utils.hpp | 14 ++++++++++++++ src/op_bnd_box.cpp | 5 +++-- src/op_composite.cpp | 4 ++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 0030221a..0195df9c 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -70,4 +70,18 @@ dim3 GetMaximumPotentialBlockSize2D(KernalFunc kernel, size_t sharedMemSizePerBl return dim3(warpSize, blockSize / warpSize, 1); } + +/** + * @brief Get the grid size for a 2D kernel. + * + * @param[in] width The width of the image. + * @param[in] height The height of the image. + * @param[in] batchSize The batch size of the image. + * @param[in] blockSize The block size of the kernel. + * @return The grid size. + */ +static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize, dim3 blockSize) { + return dim3((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, batchSize); +} + } // namespace roccv::detail \ No newline at end of file diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp index e3cfa717..82c1ba5c 100644 --- a/src/op_bnd_box.cpp +++ b/src/op_bnd_box.cpp @@ -52,8 +52,9 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso auto batchSize = inputWrapper.batches(); switch (device) { case eDeviceType::GPU: { - const dim3 block(32, 32); - const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y, batchSize); + const dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::bndbox_kernel, ImageWrapper>, 0); + const dim3 grid = detail::GetGridSize2D(width, height, batchSize, block); Rect_t *rects_ptr = nullptr; const auto n_rects = rects->size(); diff --git a/src/op_composite.cpp b/src/op_composite.cpp index 8073c7f4..1b21cc58 100644 --- a/src/op_composite.cpp +++ b/src/op_composite.cpp @@ -43,8 +43,8 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c case eDeviceType::GPU: { dim3 block = detail::GetMaximumPotentialBlockSize2D( Kernels::Device::composite, ImageWrapper, ImageWrapper>, 0); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::composite<<>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper); break; } From 2ceddcedaaa141fe9413c8f163c8fcf251b88118 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 9 Feb 2026 14:48:54 -0500 Subject: [PATCH 05/17] Add dynamic block size calculations to operators --- include/kernels/device/bnd_box_device.hpp | 1 + include/kernels/host/bnd_box_host.hpp | 1 + src/op_bnd_box.cpp | 2 - src/op_copy_make_border.cpp | 11 +-- src/op_custom_crop.cpp | 8 +- src/op_cvt_color.cpp | 92 ++++++++++++++++------- src/op_flip.cpp | 8 +- src/op_gamma_contrast.cpp | 13 ++-- src/op_normalize.cpp | 12 ++- src/op_resize.cpp | 18 +++-- src/op_rotate.cpp | 7 +- src/op_thresholding.cpp | 36 ++++++--- src/op_warp_perspective.cpp | 8 +- 13 files changed, 140 insertions(+), 77 deletions(-) diff --git a/include/kernels/device/bnd_box_device.hpp b/include/kernels/device/bnd_box_device.hpp index 9076e634..14900490 100644 --- a/include/kernels/device/bnd_box_device.hpp +++ b/include/kernels/device/bnd_box_device.hpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include +#include "common/math_vector.hpp" #include "kernels/kernel_helpers.hpp" #include "operator_types.h" diff --git a/include/kernels/host/bnd_box_host.hpp b/include/kernels/host/bnd_box_host.hpp index db1af9f7..7b3826c1 100644 --- a/include/kernels/host/bnd_box_host.hpp +++ b/include/kernels/host/bnd_box_host.hpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include +#include "common/math_vector.hpp" #include "kernels/kernel_helpers.hpp" #include "operator_types.h" diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp index 82c1ba5c..6564658a 100644 --- a/src/op_bnd_box.cpp +++ b/src/op_bnd_box.cpp @@ -25,10 +25,8 @@ THE SOFTWARE. #include #include -#include #include -#include "common/math_vector.hpp" #include "common/validation_helpers.hpp" #include "core/detail/hip_utils.hpp" #include "core/tensor.hpp" diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp index 74d45894..78579393 100644 --- a/src/op_copy_make_border.cpp +++ b/src/op_copy_make_border.cpp @@ -24,9 +24,10 @@ #include #include "common/validation_helpers.hpp" +#include "core/detail/casting.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/border_wrapper.hpp" #include "core/wrappers/image_wrapper.hpp" -#include "core/wrappers/interpolation_wrapper.hpp" #include "kernels/device/copy_make_border_device.hpp" #include "kernels/host/copy_make_border_host.hpp" @@ -43,10 +44,10 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp switch (device) { case eDeviceType::GPU: { - dim3 block_dim(64, 16); - dim3 grid_dim((out_desc.width() + block_dim.x - 1) / block_dim.x, - (out_desc.height() + block_dim.y - 1) / block_dim.y, out_desc.batches()); - Kernels::Device::copy_make_border<<>>(in_desc, out_desc, top, left); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::copy_make_border, ImageWrapper>, 0); + dim3 grid = detail::GetGridSize2D(out_desc.width(), out_desc.height(), out_desc.batches(), block); + Kernels::Device::copy_make_border<<>>(in_desc, out_desc, top, left); break; } case eDeviceType::CPU: { diff --git a/src/op_custom_crop.cpp b/src/op_custom_crop.cpp index e0e84c77..049777b9 100644 --- a/src/op_custom_crop.cpp +++ b/src/op_custom_crop.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/custom_crop_device.hpp" #include "kernels/host/custom_crop_host.hpp" @@ -40,9 +41,10 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::custom_crop, ImageWrapper>, 0); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::custom_crop<<>>(inputWrapper, outputWrapper, cropRect); break; } diff --git a/src/op_cvt_color.cpp b/src/op_cvt_color.cpp index 31abd103..0a29518e 100644 --- a/src/op_cvt_color.cpp +++ b/src/op_cvt_color.cpp @@ -23,9 +23,8 @@ THE SOFTWARE. #include -#include - #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/tensor.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/cvt_color_device.hpp" @@ -81,45 +80,82 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu if (device == eDeviceType::GPU) { // Dispatch appropriate device kernel based on given conversion code - dim3 blockSize(32, 16); - dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, samples); - switch (conversionCode) { - case eColorConversionCode::COLOR_BGR2GRAY: + case eColorConversionCode::COLOR_BGR2GRAY: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::rgb_or_bgr_to_grayscale, + ImageWrapper>, + 0); + dim3 grid = detail::GetGridSize2D(width, height, samples, block); Kernels::Device::rgb_or_bgr_to_grayscale - <<>>(ImageWrapper(input), ImageWrapper(output)); + <<>>(ImageWrapper(input), ImageWrapper(output)); break; - - case eColorConversionCode::COLOR_RGB2GRAY: + } + + case eColorConversionCode::COLOR_RGB2GRAY: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::rgb_or_bgr_to_grayscale, + ImageWrapper>, + 0); + dim3 grid = detail::GetGridSize2D(width, height, samples, block); Kernels::Device::rgb_or_bgr_to_grayscale - <<>>(ImageWrapper(input), ImageWrapper(output)); + <<>>(ImageWrapper(input), ImageWrapper(output)); break; + } case eColorConversionCode::COLOR_BGR2RGB: - case eColorConversionCode::COLOR_RGB2BGR: + case eColorConversionCode::COLOR_RGB2BGR: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::reorder, ImageWrapper>, 0); + dim3 grid = detail::GetGridSize2D(width, height, samples, block); Kernels::Device::reorder - <<>>(ImageWrapper(input), ImageWrapper(output)); + <<>>(ImageWrapper(input), ImageWrapper(output)); break; - - case eColorConversionCode::COLOR_BGR2YUV: - Kernels::Device::rgb_or_bgr_to_yuv<<>>( - ImageWrapper(input), ImageWrapper(output), 128.0f); + } + + case eColorConversionCode::COLOR_BGR2YUV: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::rgb_or_bgr_to_yuv, + ImageWrapper>, + 0); + dim3 grid = detail::GetGridSize2D(width, height, samples, block); + Kernels::Device::rgb_or_bgr_to_yuv + <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; - - case eColorConversionCode::COLOR_RGB2YUV: - Kernels::Device::rgb_or_bgr_to_yuv<<>>( - ImageWrapper(input), ImageWrapper(output), 128.0f); + } + + case eColorConversionCode::COLOR_RGB2YUV: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::rgb_or_bgr_to_yuv, + ImageWrapper>, + 0); + dim3 grid = detail::GetGridSize2D(width, height, samples, block); + Kernels::Device::rgb_or_bgr_to_yuv + <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; - - case eColorConversionCode::COLOR_YUV2BGR: - Kernels::Device::yuv_to_rgb_or_bgr<<>>( - ImageWrapper(input), ImageWrapper(output), 128.0f); + } + + case eColorConversionCode::COLOR_YUV2BGR: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::yuv_to_rgb_or_bgr, + ImageWrapper>, + 0); + dim3 grid = detail::GetGridSize2D(width, height, samples, block); + Kernels::Device::yuv_to_rgb_or_bgr + <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; - - case eColorConversionCode::COLOR_YUV2RGB: - Kernels::Device::yuv_to_rgb_or_bgr<<>>( - ImageWrapper(input), ImageWrapper(output), 128.0f); + } + + case eColorConversionCode::COLOR_YUV2RGB: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::yuv_to_rgb_or_bgr, + ImageWrapper>, + 0); + dim3 grid = detail::GetGridSize2D(width, height, samples, block); + Kernels::Device::yuv_to_rgb_or_bgr + <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; + } default: throw Exception("Not implemented", eStatusType::NOT_IMPLEMENTED); diff --git a/src/op_flip.cpp b/src/op_flip.cpp index 84e5ccd9..66219220 100644 --- a/src/op_flip.cpp +++ b/src/op_flip.cpp @@ -27,6 +27,7 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/exception.hpp" #include "core/status_type.h" #include "core/wrappers/image_wrapper.hpp" @@ -42,9 +43,10 @@ void dispatch_flip_axis(hipStream_t stream, const Tensor& input, const Tensor& o switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::flip, ImageWrapper>, 0); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::flip<<>>(inputWrapper, outputWrapper); break; } diff --git a/src/op_gamma_contrast.cpp b/src/op_gamma_contrast.cpp index 98a44a06..04eee593 100644 --- a/src/op_gamma_contrast.cpp +++ b/src/op_gamma_contrast.cpp @@ -24,15 +24,11 @@ THE SOFTWARE. #include -#include #include #include -#include -#include -#include "common/array_wrapper.hpp" -#include "common/math_vector.hpp" #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/tensor.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/gamma_contrast_device.hpp" @@ -47,9 +43,10 @@ void dispatch_gamma_contrast_dtype(hipStream_t stream, const Tensor &input, cons ImageWrapper outputWrapper(output); if (device == eDeviceType::GPU) { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::gamma_contrast, ImageWrapper>, 0); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::gamma_contrast<<>>(inputWrapper, outputWrapper, gamma); } else if (device == eDeviceType::CPU) { diff --git a/src/op_normalize.cpp b/src/op_normalize.cpp index 7cc2569e..90493ff3 100644 --- a/src/op_normalize.cpp +++ b/src/op_normalize.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/detail/type_traits.hpp" #include "core/tensor.hpp" #include "core/wrappers/image_wrapper.hpp" @@ -49,9 +50,11 @@ void dispatch_normalize_stddev(hipStream_t stream, const Tensor& input, const Te switch (device) { case eDeviceType::GPU: { - dim3 block(32, 8); - dim3 grid((outputWrap.width() + block.x - 1) / block.x, (outputWrap.height() + block.y - 1) / block.y, - outputWrap.batches()); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::normalize, ImageWrapper, ImageWrapper, + ImageWrapper>, + 0); + dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block); Kernels::Device::normalize <<>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, epsilon); break; @@ -112,7 +115,8 @@ void Normalize::operator()(hipStream_t stream, const Tensor& input, const Tensor // TODO: Need to support scalar base/scale tensors at some point. Will require some extra handling on the kernel // level. Once in place, this check can be removed. CHECK_TENSOR_COMPARISON(base.shape(base.layout().channels_index()) == input.shape(input.layout().channels_index())); - CHECK_TENSOR_COMPARISON(scale.shape(scale.layout().channels_index()) == input.shape(input.layout().channels_index())); + CHECK_TENSOR_COMPARISON(scale.shape(scale.layout().channels_index()) == + input.shape(input.layout().channels_index())); // Create kernel dispatching table based on input/output datatype and number of channels. // clang-format off diff --git a/src/op_resize.cpp b/src/op_resize.cpp index ef508d5d..bc74c8a3 100644 --- a/src/op_resize.cpp +++ b/src/op_resize.cpp @@ -25,7 +25,7 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" -#include "core/detail/casting.hpp" +#include "core/detail/hip_utils.hpp" #include "core/exception.hpp" #include "core/status_type.h" #include "core/wrappers/interpolation_wrapper.hpp" @@ -45,9 +45,10 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); + dim3 block = + detail::GetMaximumPotentialBlockSize2D(Kernels::Device::resize, ImageWrapper>, 0); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::resize<<>>(inputWrapper, outputWrapper, scaleX, scaleY); break; } @@ -65,10 +66,11 @@ void dispatch_resize_dtype(hipStream_t stream, const Tensor& input, const Tensor static const std::unordered_map< eInterpolationType, std::function> - funcs = {{eInterpolationType::INTERP_TYPE_NEAREST, dispatch_resize_interp}, - {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp}, - {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp} - }; + funcs = { + {eInterpolationType::INTERP_TYPE_NEAREST, + dispatch_resize_interp}, + {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp}, + {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp}}; if (!funcs.contains(interpolation)) { throw Exception("Operation does not support the given interpolation mode.", eStatusType::NOT_IMPLEMENTED); diff --git a/src/op_rotate.cpp b/src/op_rotate.cpp index d692631f..a9f2bbb0 100644 --- a/src/op_rotate.cpp +++ b/src/op_rotate.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include "common/array_wrapper.hpp" #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/interpolation_wrapper.hpp" #include "kernels/device/rotate_device.hpp" #include "kernels/host/rotate_host.hpp" @@ -59,9 +60,9 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso switch (device) { case eDeviceType::GPU: { - dim3 block(32, 16); - dim3 grid((outputWrap.width() + block.x - 1) / block.x, (outputWrap.height() + block.y - 1) / block.y, - outputWrap.batches()); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::rotate, ImageWrapper, ArrayWrapper>, 0); + dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block); Kernels::Device::rotate<<>>(inputWrap, outputWrap, matWrap); break; } diff --git a/src/op_thresholding.cpp b/src/op_thresholding.cpp index 04613a2c..72ea45d9 100644 --- a/src/op_thresholding.cpp +++ b/src/op_thresholding.cpp @@ -26,8 +26,7 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" -#include "core/exception.hpp" -#include "core/status_type.h" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/generic_tensor_wrapper.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/thresholding_device.hpp" @@ -50,32 +49,49 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten const auto width = input.shape()[input.shape().layout().width_index()]; if (device == eDeviceType::GPU) { - dim3 block(64, 16); - dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y, m_maxBatchSize); - switch (m_threshType) { - case THRESH_BINARY: + case THRESH_BINARY: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::binary_generic, ImageWrapper>, 0); + dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block); Kernels::Device::binary_generic<<>>( inputWrapper, outputWrapper, GenericTensorWrapper(thresh), GenericTensorWrapper(maxVal), m_maxBatchSize); break; - case THRESH_BINARY_INV: + } + case THRESH_BINARY_INV: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::binary_inv_generic, ImageWrapper>, 0); + dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block); Kernels::Device::binary_inv_generic<<>>( inputWrapper, outputWrapper, GenericTensorWrapper(thresh), GenericTensorWrapper(maxVal), m_maxBatchSize); break; - case THRESH_TRUNC: + } + case THRESH_TRUNC: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::trunc_generic, ImageWrapper>, 0); + dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block); Kernels::Device::trunc_generic<<>>( inputWrapper, outputWrapper, GenericTensorWrapper(thresh), m_maxBatchSize); break; - case THRESH_TOZERO: + } + case THRESH_TOZERO: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::tozero_generic, ImageWrapper>, 0); + dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block); Kernels::Device::tozero_generic<<>>( inputWrapper, outputWrapper, GenericTensorWrapper(thresh), m_maxBatchSize); break; - case THRESH_TOZERO_INV: + } + case THRESH_TOZERO_INV: { + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::tozeroinv_generic, ImageWrapper>, 0); + dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block); Kernels::Device::tozeroinv_generic<<>>( inputWrapper, outputWrapper, GenericTensorWrapper(thresh), m_maxBatchSize); break; + } } } else if (device == eDeviceType::CPU) { diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp index c90e36fb..6cf9052a 100644 --- a/src/op_warp_perspective.cpp +++ b/src/op_warp_perspective.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include "common/array_wrapper.hpp" #include "common/validation_helpers.hpp" #include "core/detail/casting.hpp" +#include "core/detail/hip_utils.hpp" #include "core/detail/math/math.hpp" #include "core/detail/type_traits.hpp" #include "kernels/device/warp_perspective_device.hpp" @@ -43,9 +44,10 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::warp_perspective, ImageWrapper, ArrayWrapper>, 0); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::warp_perspective<<>>(inputWrapper, outputWrapper, transform); break; } From 8c84900e2413ca529573810e24dfbbdc65a91dca Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 9 Feb 2026 16:46:11 -0500 Subject: [PATCH 06/17] Add dynamic block sizes to rest of operators --- src/op_bilateral_filter.cpp | 8 +++----- src/op_convert_to.cpp | 35 +++++++++++++++++----------------- src/op_histogram.cpp | 2 -- src/op_non_max_suppression.cpp | 1 - src/op_remap.cpp | 11 +++++------ src/op_warp_perspective.cpp | 1 - 6 files changed, 25 insertions(+), 33 deletions(-) diff --git a/src/op_bilateral_filter.cpp b/src/op_bilateral_filter.cpp index 7c39ac40..13e5c317 100644 --- a/src/op_bilateral_filter.cpp +++ b/src/op_bilateral_filter.cpp @@ -24,14 +24,11 @@ THE SOFTWARE. #include #include -#include #include -#include "common/array_wrapper.hpp" #include "common/validation_helpers.hpp" #include "core/detail/casting.hpp" -#include "core/detail/math/math.hpp" -#include "core/detail/type_traits.hpp" +#include "core/detail/hip_utils.hpp" #include "kernels/device/bilateral_filter_device.hpp" #include "kernels/host/bilateral_filter_host.hpp" @@ -72,7 +69,8 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp float colorCoeff = -1 / (2 * sigmaColor * sigmaColor); if (device == eDeviceType::GPU) { - dim3 block(8, 8); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::bilateral_filter, ImageWrapper>, 0); uint32_t xGridSize = (outputWrapper.width() + (block.x * 2) - 1) / (block.x * 2); uint32_t yGridSize = (outputWrapper.height() + (block.y * 2) - 1) / (block.y * 2); uint32_t zGridSize = outputWrapper.batches(); diff --git a/src/op_convert_to.cpp b/src/op_convert_to.cpp index 6a8173d4..8e59d446 100644 --- a/src/op_convert_to.cpp +++ b/src/op_convert_to.cpp @@ -21,22 +21,23 @@ THE SOFTWARE. */ #include "op_convert_to.hpp" +#include + #include -#include -#include "core/wrappers/image_wrapper.hpp" #include "common/validation_helpers.hpp" #include "core/detail/casting.hpp" +#include "core/detail/hip_utils.hpp" #include "core/detail/type_traits.hpp" +#include "core/wrappers/image_wrapper.hpp" #include "kernels/device/convert_to_device.hpp" #include "kernels/host/convert_to_host.hpp" namespace roccv { template -void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output, - const double alpha, const double beta, const eDeviceType device) { - +void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output, const double alpha, + const double beta, const eDeviceType device) { using SRC_DT_NC = detail::MakeType; using DST_DT_NC = detail::MakeType; @@ -47,16 +48,17 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const using DST_BT = detail::BaseType; using DT_AB = decltype(float() * SRC_BT() * DST_BT()); - + DT_AB alpha_ab = detail::SaturateCast(alpha); DT_AB beta_ab = detail::SaturateCast(beta); // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::convert_to, ImageWrapper, DT_AB>, 0); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::convert_to<<>>(inputWrapper, outputWrapper, alpha_ab, beta_ab); break; } @@ -68,16 +70,14 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const } template -void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, - const double alpha, const double beta, const eDeviceType device) { - +void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, const double alpha, + const double beta, const eDeviceType device) { int64_t channels = output.shape(output.layout().channels_index()); // Select kernel dispatcher based on number of channels. // clang-format off static const std::array, 4> funcs = {dispatch_convert_to_channels, dispatch_convert_to_channels, dispatch_convert_to_channels, dispatch_convert_to_channels}; - - + // clang-format on auto func = funcs.at(channels - 1); @@ -86,11 +86,10 @@ void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, c } template -void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, - const double alpha, const double beta, const eDeviceType device) { - +void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, const double alpha, + const double beta, const eDeviceType device) { eDataType output_dtype = output.dtype().etype(); - + // Select kernel dispatcher based on a base input datatype. // clang-format off static const std::unordered_map> diff --git a/src/op_histogram.cpp b/src/op_histogram.cpp index 13cb6ac2..832a7dfe 100644 --- a/src/op_histogram.cpp +++ b/src/op_histogram.cpp @@ -22,13 +22,11 @@ THE SOFTWARE. #include "op_histogram.hpp" #include -#include #include #include #include -#include "common/array_wrapper.hpp" #include "common/validation_helpers.hpp" #include "core/wrappers/generic_tensor_wrapper.hpp" #include "core/wrappers/image_wrapper.hpp" diff --git a/src/op_non_max_suppression.cpp b/src/op_non_max_suppression.cpp index 0900fc50..fc61b51b 100644 --- a/src/op_non_max_suppression.cpp +++ b/src/op_non_max_suppression.cpp @@ -24,7 +24,6 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" -#include "core/hip_assert.h" #include "kernels/device/non_max_suppression_device.hpp" #include "kernels/host/non_max_suppression_host.hpp" diff --git a/src/op_remap.cpp b/src/op_remap.cpp index 27f2e284..c7bde25b 100644 --- a/src/op_remap.cpp +++ b/src/op_remap.cpp @@ -23,11 +23,9 @@ THE SOFTWARE. #include -#include "common/array_wrapper.hpp" #include "common/validation_helpers.hpp" #include "core/detail/casting.hpp" -#include "core/detail/math/math.hpp" -#include "core/detail/type_traits.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/image_wrapper.hpp" #include "core/wrappers/interpolation_wrapper.hpp" #include "kernels/device/remap_device.hpp" @@ -49,9 +47,10 @@ void dispatch_remap_mapInterp(hipStream_t stream, const Tensor &input, const Ten // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); + dim3 block = detail::GetMaximumPotentialBlockSize2D( + Kernels::Device::remap, ImageWrapper, InterpolationWrapper>, 0); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::remap<<>>(inputWrapper, outputWrapper, wrappedMapTensor); break; } diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp index 6cf9052a..5857e2b6 100644 --- a/src/op_warp_perspective.cpp +++ b/src/op_warp_perspective.cpp @@ -28,7 +28,6 @@ THE SOFTWARE. #include "core/detail/casting.hpp" #include "core/detail/hip_utils.hpp" #include "core/detail/math/math.hpp" -#include "core/detail/type_traits.hpp" #include "kernels/device/warp_perspective_device.hpp" #include "kernels/host/warp_perspective_host.hpp" From 284eca8366697a0ce084d3f023dab192c62367a3 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 17 Feb 2026 12:47:24 -0500 Subject: [PATCH 07/17] Address PR comments --- include/core/detail/hip_utils.hpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 0195df9c..ad1c264f 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -56,8 +56,8 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) { * @param[in] sharedMemSizePerBlock The shared memory size per block. * @return The maximum potential block size. */ -template -dim3 GetMaximumPotentialBlockSize2D(KernalFunc kernel, size_t sharedMemSizePerBlock) { +template +dim3 GetMaximumPotentialBlockSize2D(KernelFunc kernel, size_t sharedMemSizePerBlock) { int minimumGridSize; int blockSize; int deviceId; @@ -66,9 +66,14 @@ dim3 GetMaximumPotentialBlockSize2D(KernalFunc kernel, size_t sharedMemSizePerBl HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId)); HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId)); HIP_VALIDATE_NO_ERRORS( - hipOccupancyMaxPotentialBlockSize(&minimumGridSize, &blockSize, kernel, sharedMemSizePerBlock, warpSize)); + hipOccupancyMaxPotentialBlockSize(&minimumGridSize, &blockSize, kernel, sharedMemSizePerBlock, 0)); - return dim3(warpSize, blockSize / warpSize, 1); + if (blockSize >= warpSize && (blockSize % warpSize) == 0) { + return dim3(warpSize, blockSize / warpSize, 1); + } + + // Fallback to block size if it's not a multiple of the warp size + return dim3(blockSize, 1, 1); } /** From 77d79e464e1a860005a9d27f724683e1af1c4f0d Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 19 Feb 2026 16:26:59 -0500 Subject: [PATCH 08/17] Update block calculations to floor result --- include/core/detail/hip_utils.hpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index ad1c264f..331d67b4 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -68,12 +68,7 @@ dim3 GetMaximumPotentialBlockSize2D(KernelFunc kernel, size_t sharedMemSizePerBl HIP_VALIDATE_NO_ERRORS( hipOccupancyMaxPotentialBlockSize(&minimumGridSize, &blockSize, kernel, sharedMemSizePerBlock, 0)); - if (blockSize >= warpSize && (blockSize % warpSize) == 0) { - return dim3(warpSize, blockSize / warpSize, 1); - } - - // Fallback to block size if it's not a multiple of the warp size - return dim3(blockSize, 1, 1); + return dim3(warpSize, blockSize / warpSize, 1); } /** From 1c37b9108c659d7a97fd7866c7695a7315a0ac47 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Fri, 20 Feb 2026 12:05:20 -0500 Subject: [PATCH 09/17] Hardcode target block size instead of querying runtime --- include/core/detail/hip_utils.hpp | 16 +++++---------- src/op_bilateral_filter.cpp | 3 +-- src/op_bnd_box.cpp | 3 +-- src/op_composite.cpp | 3 +-- src/op_convert_to.cpp | 3 +-- src/op_copy_make_border.cpp | 3 +-- src/op_custom_crop.cpp | 3 +-- src/op_cvt_color.cpp | 33 +++++++------------------------ src/op_flip.cpp | 3 +-- src/op_gamma_contrast.cpp | 3 +-- src/op_normalize.cpp | 5 +---- src/op_remap.cpp | 3 +-- src/op_resize.cpp | 3 +-- src/op_rotate.cpp | 3 +-- src/op_thresholding.cpp | 15 +++++--------- src/op_warp_perspective.cpp | 3 +-- 16 files changed, 30 insertions(+), 75 deletions(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 331d67b4..be59eba6 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -50,25 +50,19 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) { } /** - * @brief Get the maximum potential block size for a 2D kernel. + * @brief Get the block size for a 2D kernel. * - * @param[in] kernel The kernel function to get the maximum potential block size for. - * @param[in] sharedMemSizePerBlock The shared memory size per block. - * @return The maximum potential block size. + * @param[in] targetBlockSize The target block size. + * @return The block size. */ -template -dim3 GetMaximumPotentialBlockSize2D(KernelFunc kernel, size_t sharedMemSizePerBlock) { - int minimumGridSize; - int blockSize; +static inline dim3 GetBlockSize2D(int targetBlockSize = 512) { int deviceId; int warpSize; HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId)); HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId)); - HIP_VALIDATE_NO_ERRORS( - hipOccupancyMaxPotentialBlockSize(&minimumGridSize, &blockSize, kernel, sharedMemSizePerBlock, 0)); - return dim3(warpSize, blockSize / warpSize, 1); + return dim3(warpSize, targetBlockSize / warpSize, 1); } /** diff --git a/src/op_bilateral_filter.cpp b/src/op_bilateral_filter.cpp index 13e5c317..115821f0 100644 --- a/src/op_bilateral_filter.cpp +++ b/src/op_bilateral_filter.cpp @@ -69,8 +69,7 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp float colorCoeff = -1 / (2 * sigmaColor * sigmaColor); if (device == eDeviceType::GPU) { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::bilateral_filter, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); uint32_t xGridSize = (outputWrapper.width() + (block.x * 2) - 1) / (block.x * 2); uint32_t yGridSize = (outputWrapper.height() + (block.y * 2) - 1) / (block.y * 2); uint32_t zGridSize = outputWrapper.batches(); diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp index 6564658a..9e03736f 100644 --- a/src/op_bnd_box.cpp +++ b/src/op_bnd_box.cpp @@ -50,8 +50,7 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso auto batchSize = inputWrapper.batches(); switch (device) { case eDeviceType::GPU: { - const dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::bndbox_kernel, ImageWrapper>, 0); + const dim3 block = detail::GetBlockSize2D(); const dim3 grid = detail::GetGridSize2D(width, height, batchSize, block); Rect_t *rects_ptr = nullptr; diff --git a/src/op_composite.cpp b/src/op_composite.cpp index 1b21cc58..db6545f7 100644 --- a/src/op_composite.cpp +++ b/src/op_composite.cpp @@ -41,8 +41,7 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::composite, ImageWrapper, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::composite<<>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper); diff --git a/src/op_convert_to.cpp b/src/op_convert_to.cpp index 8e59d446..34503cfb 100644 --- a/src/op_convert_to.cpp +++ b/src/op_convert_to.cpp @@ -55,8 +55,7 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::convert_to, ImageWrapper, DT_AB>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::convert_to<<>>(inputWrapper, outputWrapper, alpha_ab, beta_ab); diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp index 78579393..887e3b38 100644 --- a/src/op_copy_make_border.cpp +++ b/src/op_copy_make_border.cpp @@ -44,8 +44,7 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::copy_make_border, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(out_desc.width(), out_desc.height(), out_desc.batches(), block); Kernels::Device::copy_make_border<<>>(in_desc, out_desc, top, left); break; diff --git a/src/op_custom_crop.cpp b/src/op_custom_crop.cpp index 049777b9..3d5d2e8f 100644 --- a/src/op_custom_crop.cpp +++ b/src/op_custom_crop.cpp @@ -41,8 +41,7 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::custom_crop, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::custom_crop<<>>(inputWrapper, outputWrapper, cropRect); diff --git a/src/op_cvt_color.cpp b/src/op_cvt_color.cpp index 0a29518e..9094854c 100644 --- a/src/op_cvt_color.cpp +++ b/src/op_cvt_color.cpp @@ -82,10 +82,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu switch (conversionCode) { case eColorConversionCode::COLOR_BGR2GRAY: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::rgb_or_bgr_to_grayscale, - ImageWrapper>, - 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, samples, block); Kernels::Device::rgb_or_bgr_to_grayscale <<>>(ImageWrapper(input), ImageWrapper(output)); @@ -93,10 +90,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu } case eColorConversionCode::COLOR_RGB2GRAY: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::rgb_or_bgr_to_grayscale, - ImageWrapper>, - 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, samples, block); Kernels::Device::rgb_or_bgr_to_grayscale <<>>(ImageWrapper(input), ImageWrapper(output)); @@ -105,8 +99,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu case eColorConversionCode::COLOR_BGR2RGB: case eColorConversionCode::COLOR_RGB2BGR: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::reorder, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, samples, block); Kernels::Device::reorder <<>>(ImageWrapper(input), ImageWrapper(output)); @@ -114,10 +107,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu } case eColorConversionCode::COLOR_BGR2YUV: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::rgb_or_bgr_to_yuv, - ImageWrapper>, - 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, samples, block); Kernels::Device::rgb_or_bgr_to_yuv <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); @@ -125,10 +115,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu } case eColorConversionCode::COLOR_RGB2YUV: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::rgb_or_bgr_to_yuv, - ImageWrapper>, - 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, samples, block); Kernels::Device::rgb_or_bgr_to_yuv <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); @@ -136,10 +123,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu } case eColorConversionCode::COLOR_YUV2BGR: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::yuv_to_rgb_or_bgr, - ImageWrapper>, - 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, samples, block); Kernels::Device::yuv_to_rgb_or_bgr <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); @@ -147,10 +131,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu } case eColorConversionCode::COLOR_YUV2RGB: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::yuv_to_rgb_or_bgr, - ImageWrapper>, - 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, samples, block); Kernels::Device::yuv_to_rgb_or_bgr <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); diff --git a/src/op_flip.cpp b/src/op_flip.cpp index 66219220..002d4e48 100644 --- a/src/op_flip.cpp +++ b/src/op_flip.cpp @@ -43,8 +43,7 @@ void dispatch_flip_axis(hipStream_t stream, const Tensor& input, const Tensor& o switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::flip, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::flip<<>>(inputWrapper, outputWrapper); diff --git a/src/op_gamma_contrast.cpp b/src/op_gamma_contrast.cpp index 04eee593..0362046f 100644 --- a/src/op_gamma_contrast.cpp +++ b/src/op_gamma_contrast.cpp @@ -43,8 +43,7 @@ void dispatch_gamma_contrast_dtype(hipStream_t stream, const Tensor &input, cons ImageWrapper outputWrapper(output); if (device == eDeviceType::GPU) { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::gamma_contrast, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); diff --git a/src/op_normalize.cpp b/src/op_normalize.cpp index 90493ff3..802ec680 100644 --- a/src/op_normalize.cpp +++ b/src/op_normalize.cpp @@ -50,10 +50,7 @@ void dispatch_normalize_stddev(hipStream_t stream, const Tensor& input, const Te switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::normalize, ImageWrapper, ImageWrapper, - ImageWrapper>, - 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block); Kernels::Device::normalize <<>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, epsilon); diff --git a/src/op_remap.cpp b/src/op_remap.cpp index c7bde25b..1fdfced6 100644 --- a/src/op_remap.cpp +++ b/src/op_remap.cpp @@ -47,8 +47,7 @@ void dispatch_remap_mapInterp(hipStream_t stream, const Tensor &input, const Ten // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::remap, ImageWrapper, InterpolationWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::remap<<>>(inputWrapper, outputWrapper, wrappedMapTensor); diff --git a/src/op_resize.cpp b/src/op_resize.cpp index bc74c8a3..a8df3b6d 100644 --- a/src/op_resize.cpp +++ b/src/op_resize.cpp @@ -45,8 +45,7 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso switch (device) { case eDeviceType::GPU: { - dim3 block = - detail::GetMaximumPotentialBlockSize2D(Kernels::Device::resize, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::resize<<>>(inputWrapper, outputWrapper, scaleX, scaleY); diff --git a/src/op_rotate.cpp b/src/op_rotate.cpp index a9f2bbb0..c922fa6f 100644 --- a/src/op_rotate.cpp +++ b/src/op_rotate.cpp @@ -60,8 +60,7 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::rotate, ImageWrapper, ArrayWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block); Kernels::Device::rotate<<>>(inputWrap, outputWrap, matWrap); break; diff --git a/src/op_thresholding.cpp b/src/op_thresholding.cpp index 72ea45d9..5fc96005 100644 --- a/src/op_thresholding.cpp +++ b/src/op_thresholding.cpp @@ -51,8 +51,7 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten if (device == eDeviceType::GPU) { switch (m_threshType) { case THRESH_BINARY: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::binary_generic, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block); Kernels::Device::binary_generic<<>>( inputWrapper, outputWrapper, GenericTensorWrapper(thresh), @@ -60,8 +59,7 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten break; } case THRESH_BINARY_INV: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::binary_inv_generic, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block); Kernels::Device::binary_inv_generic<<>>( inputWrapper, outputWrapper, GenericTensorWrapper(thresh), @@ -69,24 +67,21 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten break; } case THRESH_TRUNC: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::trunc_generic, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block); Kernels::Device::trunc_generic<<>>( inputWrapper, outputWrapper, GenericTensorWrapper(thresh), m_maxBatchSize); break; } case THRESH_TOZERO: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::tozero_generic, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block); Kernels::Device::tozero_generic<<>>( inputWrapper, outputWrapper, GenericTensorWrapper(thresh), m_maxBatchSize); break; } case THRESH_TOZERO_INV: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::tozeroinv_generic, ImageWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block); Kernels::Device::tozeroinv_generic<<>>( inputWrapper, outputWrapper, GenericTensorWrapper(thresh), m_maxBatchSize); diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp index 5857e2b6..430d2130 100644 --- a/src/op_warp_perspective.cpp +++ b/src/op_warp_perspective.cpp @@ -43,8 +43,7 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetMaximumPotentialBlockSize2D( - Kernels::Device::warp_perspective, ImageWrapper, ArrayWrapper>, 0); + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::warp_perspective<<>>(inputWrapper, outputWrapper, transform); From a899bb0debca028310cf48e962e62a78157d3686 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 27 Apr 2026 11:39:34 -0400 Subject: [PATCH 10/17] Use 1D blocks for pointwise kernels --- include/core/detail/hip_utils.hpp | 55 +++++++++++++++++++++++++++++++ src/op_bnd_box.cpp | 4 +-- src/op_composite.cpp | 4 +-- src/op_convert_to.cpp | 4 +-- src/op_copy_make_border.cpp | 4 +-- src/op_custom_crop.cpp | 4 +-- src/op_cvt_color.cpp | 28 ++++++++-------- src/op_flip.cpp | 4 +-- src/op_gamma_contrast.cpp | 4 +-- src/op_normalize.cpp | 4 +-- src/op_thresholding.cpp | 4 +-- 11 files changed, 87 insertions(+), 32 deletions(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index be59eba6..668b3323 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -78,4 +78,59 @@ static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize, return dim3((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, batchSize); } +/** + * @brief Get the block size for a 1D kernel — all threads on the x axis. + * + * Use for pointwise kernels (no neighborhood reads). There is no locality + * benefit to grouping threads from different rows in the same block when + * each thread only touches its own pixel; a 1D block keeps every wavefront + * on a single contiguous row, maximizing coalescing and eliminating the + * y-axis index math and bottom-edge tail-wave waste of a 2D launch. + * + * Pair with GetGridSize1D, which lays the rows of the image out along + * gridDim.y so existing kernels can derive y directly from blockIdx.y + * without any indexing changes (since blockDim.y == 1 collapses the + * standard `y = blockDim.y * blockIdx.y + threadIdx.y` to `y = blockIdx.y`). + * + * The default of 256 threads is intentionally smaller than the 2D default + * of 512: pointwise ops are bandwidth-bound, so 4 wavefronts per block on + * AMD (warp=64) is enough to keep memory in flight on every CU while + * producing less tail waste than 512 on narrow images. + * + * @param[in] targetBlockSize Total threads per block. Should be a multiple + * of warpSize; otherwise it is silently floored + * to the nearest multiple. Defaults to 256. + * @return The block size: dim3(targetBlockSize, 1, 1), aligned to warpSize. + */ +static inline dim3 GetBlockSize1D(int targetBlockSize = 256) { + int deviceId; + int warpSize; + + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId)); + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId)); + + return dim3((targetBlockSize / warpSize) * warpSize, 1, 1); +} + +/** + * @brief Get the grid size for a 1D-row-major launch. + * + * Lays one image row along gridDim.y and the batch along gridDim.z. The + * block from GetBlockSize1D has blockDim.y == 1, so the kernel's standard + * `y = blockDim.y * blockIdx.y + threadIdx.y` collapses to `y = blockIdx.y` + * with no kernel changes required. + * + * Caller is responsible for ensuring height does not exceed the device's + * gridDim.y limit (typically 65535) and batchSize does not exceed gridDim.z. + * + * @param[in] width The width of the image. + * @param[in] height The height of the image (becomes gridDim.y). + * @param[in] batchSize The number of images (becomes gridDim.z). + * @param[in] blockSize Block size from GetBlockSize1D. + * @return The grid size: dim3(ceil(width / blockSize.x), height, batchSize). + */ +static inline dim3 GetGridSize1D(size_t width, size_t height, size_t batchSize, dim3 blockSize) { + return dim3((width + blockSize.x - 1) / blockSize.x, height, batchSize); +} + } // namespace roccv::detail \ No newline at end of file diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp index a46c35f7..42da0eac 100644 --- a/src/op_bnd_box.cpp +++ b/src/op_bnd_box.cpp @@ -50,8 +50,8 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso auto batchSize = inputWrapper.batches(); switch (device) { case eDeviceType::GPU: { - const dim3 block = detail::GetBlockSize2D(); - const dim3 grid = detail::GetGridSize2D(width, height, batchSize, block); + const dim3 block = detail::GetBlockSize1D(); + const dim3 grid = detail::GetGridSize1D(width, height, batchSize, block); Rect_t *rects_ptr = nullptr; const auto n_rects = rects->size(); diff --git a/src/op_composite.cpp b/src/op_composite.cpp index 091c54f0..559582fb 100644 --- a/src/op_composite.cpp +++ b/src/op_composite.cpp @@ -41,9 +41,9 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize2D(); + dim3 block = detail::GetBlockSize1D(); dim3 grid = - detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::composite<<>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper); break; } diff --git a/src/op_convert_to.cpp b/src/op_convert_to.cpp index 27900fd6..11067706 100644 --- a/src/op_convert_to.cpp +++ b/src/op_convert_to.cpp @@ -55,9 +55,9 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize2D(); + dim3 block = detail::GetBlockSize1D(); dim3 grid = - detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::convert_to<<>>(inputWrapper, outputWrapper, alpha_ab, beta_ab); break; } diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp index 36a29e91..7d8c2794 100644 --- a/src/op_copy_make_border.cpp +++ b/src/op_copy_make_border.cpp @@ -44,8 +44,8 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize2D(); - dim3 grid = detail::GetGridSize2D(out_desc.width(), out_desc.height(), out_desc.batches(), block); + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(out_desc.width(), out_desc.height(), out_desc.batches(), block); Kernels::Device::copy_make_border<<>>(in_desc, out_desc, top, left); break; } diff --git a/src/op_custom_crop.cpp b/src/op_custom_crop.cpp index e4ae6480..0e54db1a 100644 --- a/src/op_custom_crop.cpp +++ b/src/op_custom_crop.cpp @@ -41,9 +41,9 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize2D(); + dim3 block = detail::GetBlockSize1D(); dim3 grid = - detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::custom_crop<<>>(inputWrapper, outputWrapper, cropRect); break; } diff --git a/src/op_cvt_color.cpp b/src/op_cvt_color.cpp index 9094854c..98c0826b 100644 --- a/src/op_cvt_color.cpp +++ b/src/op_cvt_color.cpp @@ -82,16 +82,16 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu switch (conversionCode) { case eColorConversionCode::COLOR_BGR2GRAY: { - dim3 block = detail::GetBlockSize2D(); - dim3 grid = detail::GetGridSize2D(width, height, samples, block); + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); Kernels::Device::rgb_or_bgr_to_grayscale <<>>(ImageWrapper(input), ImageWrapper(output)); break; } case eColorConversionCode::COLOR_RGB2GRAY: { - dim3 block = detail::GetBlockSize2D(); - dim3 grid = detail::GetGridSize2D(width, height, samples, block); + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); Kernels::Device::rgb_or_bgr_to_grayscale <<>>(ImageWrapper(input), ImageWrapper(output)); break; @@ -99,40 +99,40 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu case eColorConversionCode::COLOR_BGR2RGB: case eColorConversionCode::COLOR_RGB2BGR: { - dim3 block = detail::GetBlockSize2D(); - dim3 grid = detail::GetGridSize2D(width, height, samples, block); + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); Kernels::Device::reorder <<>>(ImageWrapper(input), ImageWrapper(output)); break; } case eColorConversionCode::COLOR_BGR2YUV: { - dim3 block = detail::GetBlockSize2D(); - dim3 grid = detail::GetGridSize2D(width, height, samples, block); + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); Kernels::Device::rgb_or_bgr_to_yuv <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; } case eColorConversionCode::COLOR_RGB2YUV: { - dim3 block = detail::GetBlockSize2D(); - dim3 grid = detail::GetGridSize2D(width, height, samples, block); + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); Kernels::Device::rgb_or_bgr_to_yuv <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; } case eColorConversionCode::COLOR_YUV2BGR: { - dim3 block = detail::GetBlockSize2D(); - dim3 grid = detail::GetGridSize2D(width, height, samples, block); + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); Kernels::Device::yuv_to_rgb_or_bgr <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; } case eColorConversionCode::COLOR_YUV2RGB: { - dim3 block = detail::GetBlockSize2D(); - dim3 grid = detail::GetGridSize2D(width, height, samples, block); + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); Kernels::Device::yuv_to_rgb_or_bgr <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; diff --git a/src/op_flip.cpp b/src/op_flip.cpp index 79ec12b6..6bb7dd33 100644 --- a/src/op_flip.cpp +++ b/src/op_flip.cpp @@ -43,9 +43,9 @@ void dispatch_flip_axis(hipStream_t stream, const Tensor& input, const Tensor& o switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize2D(); + dim3 block = detail::GetBlockSize1D(); dim3 grid = - detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::flip<<>>(inputWrapper, outputWrapper); break; } diff --git a/src/op_gamma_contrast.cpp b/src/op_gamma_contrast.cpp index f596e5bf..43e66ed8 100644 --- a/src/op_gamma_contrast.cpp +++ b/src/op_gamma_contrast.cpp @@ -43,9 +43,9 @@ void dispatch_gamma_contrast_dtype(hipStream_t stream, const Tensor &input, cons ImageWrapper outputWrapper(output); if (device == eDeviceType::GPU) { - dim3 block = detail::GetBlockSize2D(); + dim3 block = detail::GetBlockSize1D(); dim3 grid = - detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); Kernels::Device::gamma_contrast<<>>(inputWrapper, outputWrapper, gamma); } else if (device == eDeviceType::CPU) { diff --git a/src/op_normalize.cpp b/src/op_normalize.cpp index 5aeb1a8a..35dd5b25 100644 --- a/src/op_normalize.cpp +++ b/src/op_normalize.cpp @@ -50,8 +50,8 @@ void dispatch_normalize_stddev(hipStream_t stream, const Tensor& input, const Te switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize2D(); - dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block); + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block); Kernels::Device::normalize <<>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, epsilon); break; diff --git a/src/op_thresholding.cpp b/src/op_thresholding.cpp index 837e9982..c78c9884 100644 --- a/src/op_thresholding.cpp +++ b/src/op_thresholding.cpp @@ -51,8 +51,8 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten const auto width = input.shape()[input.shape().layout().width_index()]; if (device == eDeviceType::GPU) { - dim3 block = detail::GetBlockSize2D(); - dim3 grid = detail::GetGridSize2D(width, height, outputWrapper.batches(), block); + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); switch (m_threshType) { case THRESH_BINARY: From 063f97f703755bbe9cc1ecf97839c4959c43ec29 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 27 Apr 2026 11:49:55 -0400 Subject: [PATCH 11/17] Change default 2D block size to 256 --- include/core/detail/hip_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 668b3323..7acc635c 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -55,7 +55,7 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) { * @param[in] targetBlockSize The target block size. * @return The block size. */ -static inline dim3 GetBlockSize2D(int targetBlockSize = 512) { +static inline dim3 GetBlockSize2D(int targetBlockSize = 256) { int deviceId; int warpSize; From 9ba59d468583e882b8eb0a73dd2c91e2f988c890 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 27 Apr 2026 11:59:12 -0400 Subject: [PATCH 12/17] Switch default target block size to 128 --- include/core/detail/hip_utils.hpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 7acc635c..522c7bde 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -55,7 +55,7 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) { * @param[in] targetBlockSize The target block size. * @return The block size. */ -static inline dim3 GetBlockSize2D(int targetBlockSize = 256) { +static inline dim3 GetBlockSize2D(int targetBlockSize = 128) { int deviceId; int warpSize; @@ -92,17 +92,12 @@ static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize, * without any indexing changes (since blockDim.y == 1 collapses the * standard `y = blockDim.y * blockIdx.y + threadIdx.y` to `y = blockIdx.y`). * - * The default of 256 threads is intentionally smaller than the 2D default - * of 512: pointwise ops are bandwidth-bound, so 4 wavefronts per block on - * AMD (warp=64) is enough to keep memory in flight on every CU while - * producing less tail waste than 512 on narrow images. - * * @param[in] targetBlockSize Total threads per block. Should be a multiple * of warpSize; otherwise it is silently floored * to the nearest multiple. Defaults to 256. * @return The block size: dim3(targetBlockSize, 1, 1), aligned to warpSize. */ -static inline dim3 GetBlockSize1D(int targetBlockSize = 256) { +static inline dim3 GetBlockSize1D(int targetBlockSize = 128) { int deviceId; int warpSize; From 981346addd26255e25ef5ac06238e9390429b348 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 27 Apr 2026 13:47:55 -0400 Subject: [PATCH 13/17] Use runtime API to determine maximum occupancy --- include/core/detail/hip_utils.hpp | 120 +++++++++++++++++++++--------- src/op_bilateral_filter.cpp | 6 +- src/op_bnd_box.cpp | 7 +- src/op_composite.cpp | 6 +- src/op_convert_to.cpp | 6 +- src/op_copy_make_border.cpp | 6 +- src/op_custom_crop.cpp | 5 +- src/op_cvt_color.cpp | 49 ++++++------ src/op_flip.cpp | 5 +- src/op_gamma_contrast.cpp | 5 +- src/op_normalize.cpp | 9 ++- src/op_remap.cpp | 7 +- src/op_resize.cpp | 6 +- src/op_rotate.cpp | 7 +- src/op_thresholding.cpp | 52 ++++++++----- src/op_warp_perspective.cpp | 6 +- 16 files changed, 194 insertions(+), 108 deletions(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 522c7bde..74a17108 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -50,19 +50,94 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) { } /** - * @brief Get the block size for a 2D kernel. + * @brief Get the device's wavefront/warp size, cached per-thread. * - * @param[in] targetBlockSize The target block size. - * @return The block size. + * Re-queries the runtime if the active HIP device has changed since the + * last call from the calling thread (so multi-device callers stay correct + * without paying for a query on every launch). */ -static inline dim3 GetBlockSize2D(int targetBlockSize = 128) { +inline int CachedWarpSize() { + static thread_local int cachedDeviceId = -1; + static thread_local int cachedWarpSize = 0; int deviceId; - int warpSize; + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId)); + if (deviceId != cachedDeviceId) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&cachedWarpSize, hipDeviceAttributeWarpSize, deviceId)); + cachedDeviceId = deviceId; + } + return cachedWarpSize; +} +/** + * @brief Cache hipOccupancyMaxPotentialBlockSize per (kernel, device). + * + * The driver picks a thread count that maximizes resident wavefronts per CU + * for the given kernel on the current device, accounting for the kernel's + * register and static-shared-memory usage. The result is bounded above by + * Cap so the API's drive toward maximum occupancy can't override workload- + * class judgment (memory-bound ops gain nothing past ~50% occupancy and + * can lose throughput to cache pressure with overly large blocks). + * + * Each (Kernel, Cap) instantiation gets its own thread-local cache slot, + * so the runtime query runs once per (kernel, device, cap) per thread. + * + * @tparam Kernel The __global__ function pointer (auto NTTP — each unique + * kernel address gets its own cached result). + * @tparam Cap Upper bound on the returned block size. + */ +template +inline int CachedOccupancyBlockSize() { + static thread_local int cachedDeviceId = -1; + static thread_local int cachedBlockSize = 0; + int deviceId; HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId)); - HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId)); + if (deviceId != cachedDeviceId) { + int minGridSize; + HIP_VALIDATE_NO_ERRORS(hipOccupancyMaxPotentialBlockSize(&minGridSize, &cachedBlockSize, Kernel, 0, Cap)); + cachedDeviceId = deviceId; + } + return cachedBlockSize; +} + +/** + * @brief Pick a 1D block size for a pointwise kernel via runtime occupancy + * query, capped at Cap, and cached per (kernel, device). + * + * Use for pointwise kernels (no neighborhood reads). The driver returns a + * thread count tuned to this specific kernel's register pressure on the + * current device — important on architectures with very different SIMD-per-CU + * counts and register-file sizes (e.g. CDNA wants more wavefronts in flight + * per CU than RDNA to hide HBM latency). + * + * Pair with GetGridSize1D — see its docs for the row-major launch shape. + * + * @tparam Kernel The __global__ function pointer. + * @tparam Cap Upper bound on threads per block. + * @return dim3(blockSize, 1, 1). + */ +template +inline dim3 GetBlockSize1D() { + return dim3(CachedOccupancyBlockSize(), 1, 1); +} - return dim3(warpSize, targetBlockSize / warpSize, 1); +/** + * @brief Pick a 2D block size for a stencil/transform kernel via runtime + * occupancy query, capped at Cap, and cached per (kernel, device). + * + * Use for kernels with 2D locality (stencils, interpolation neighborhoods, + * affine warps). Reshapes the queried thread count as + * (warpSize, blockSize / warpSize, 1) so threadIdx.x is wavefront-aligned + * for coalescing while threadIdx.y stacks rows for tile-style cache reuse. + * + * @tparam Kernel The __global__ function pointer. + * @tparam Cap Upper bound on threads per block. + * @return dim3(warpSize, blockSize / warpSize, 1). + */ +template +inline dim3 GetBlockSize2D() { + int blockSize = CachedOccupancyBlockSize(); + int warpSize = CachedWarpSize(); + return dim3(warpSize, blockSize / warpSize, 1); } /** @@ -78,35 +153,6 @@ static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize, return dim3((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, batchSize); } -/** - * @brief Get the block size for a 1D kernel — all threads on the x axis. - * - * Use for pointwise kernels (no neighborhood reads). There is no locality - * benefit to grouping threads from different rows in the same block when - * each thread only touches its own pixel; a 1D block keeps every wavefront - * on a single contiguous row, maximizing coalescing and eliminating the - * y-axis index math and bottom-edge tail-wave waste of a 2D launch. - * - * Pair with GetGridSize1D, which lays the rows of the image out along - * gridDim.y so existing kernels can derive y directly from blockIdx.y - * without any indexing changes (since blockDim.y == 1 collapses the - * standard `y = blockDim.y * blockIdx.y + threadIdx.y` to `y = blockIdx.y`). - * - * @param[in] targetBlockSize Total threads per block. Should be a multiple - * of warpSize; otherwise it is silently floored - * to the nearest multiple. Defaults to 256. - * @return The block size: dim3(targetBlockSize, 1, 1), aligned to warpSize. - */ -static inline dim3 GetBlockSize1D(int targetBlockSize = 128) { - int deviceId; - int warpSize; - - HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId)); - HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId)); - - return dim3((targetBlockSize / warpSize) * warpSize, 1, 1); -} - /** * @brief Get the grid size for a 1D-row-major launch. * @@ -128,4 +174,4 @@ static inline dim3 GetGridSize1D(size_t width, size_t height, size_t batchSize, return dim3((width + blockSize.x - 1) / blockSize.x, height, batchSize); } -} // namespace roccv::detail \ No newline at end of file +} // namespace roccv::detail diff --git a/src/op_bilateral_filter.cpp b/src/op_bilateral_filter.cpp index 73c3fc66..b6eb6511 100644 --- a/src/op_bilateral_filter.cpp +++ b/src/op_bilateral_filter.cpp @@ -68,14 +68,14 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp float colorCoeff = -1 / (2 * sigmaColor * sigmaColor); if (device == eDeviceType::GPU) { - dim3 block = detail::GetBlockSize2D(); + constexpr auto kernel = Kernels::Device::bilateral_filter, ImageWrapper>; + dim3 block = detail::GetBlockSize2D(); uint32_t xGridSize = (outputWrapper.width() + (block.x * 2) - 1) / (block.x * 2); uint32_t yGridSize = (outputWrapper.height() + (block.y * 2) - 1) / (block.y * 2); uint32_t zGridSize = outputWrapper.batches(); dim3 grid(xGridSize, yGridSize, zGridSize); - Kernels::Device::bilateral_filter - <<>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff); + kernel<<>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff); } else if (device == eDeviceType::CPU) { int divisor = std::gcd(4, outputWrapper.height()); // greatest common divisor int dividend = std::gcd((numThreads / divisor), outputWrapper.width()); diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp index 42da0eac..3c0d5430 100644 --- a/src/op_bnd_box.cpp +++ b/src/op_bnd_box.cpp @@ -50,7 +50,8 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso auto batchSize = inputWrapper.batches(); switch (device) { case eDeviceType::GPU: { - const dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::bndbox_kernel, ImageWrapper>; + const dim3 block = detail::GetBlockSize1D(); const dim3 grid = detail::GetGridSize1D(width, height, batchSize, block); Rect_t *rects_ptr = nullptr; @@ -61,8 +62,8 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso HIP_VALIDATE_NO_ERRORS( hipMemcpyAsync(rects_ptr, rects->data(), sizeof(Rect_t) * n_rects, hipMemcpyHostToDevice, stream)); } - Kernels::Device::bndbox_kernel - <<>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height, width); + kernel<<>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height, + width); if (n_rects > 0) { HIP_VALIDATE_NO_ERRORS(hipFreeAsync(rects_ptr, stream)); } diff --git a/src/op_composite.cpp b/src/op_composite.cpp index 559582fb..3de572cf 100644 --- a/src/op_composite.cpp +++ b/src/op_composite.cpp @@ -41,10 +41,12 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::composite, ImageWrapper, + ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); - Kernels::Device::composite<<>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper); + kernel<<>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper); break; } diff --git a/src/op_convert_to.cpp b/src/op_convert_to.cpp index 11067706..936f315d 100644 --- a/src/op_convert_to.cpp +++ b/src/op_convert_to.cpp @@ -55,10 +55,12 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = + Kernels::Device::convert_to, ImageWrapper, DT_AB>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); - Kernels::Device::convert_to<<>>(inputWrapper, outputWrapper, alpha_ab, beta_ab); + kernel<<>>(inputWrapper, outputWrapper, alpha_ab, beta_ab); break; } case eDeviceType::CPU: { diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp index 7d8c2794..8a16f864 100644 --- a/src/op_copy_make_border.cpp +++ b/src/op_copy_make_border.cpp @@ -44,9 +44,11 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = + Kernels::Device::copy_make_border, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(out_desc.width(), out_desc.height(), out_desc.batches(), block); - Kernels::Device::copy_make_border<<>>(in_desc, out_desc, top, left); + kernel<<>>(in_desc, out_desc, top, left); break; } case eDeviceType::CPU: { diff --git a/src/op_custom_crop.cpp b/src/op_custom_crop.cpp index 0e54db1a..89916758 100644 --- a/src/op_custom_crop.cpp +++ b/src/op_custom_crop.cpp @@ -41,10 +41,11 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::custom_crop, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); - Kernels::Device::custom_crop<<>>(inputWrapper, outputWrapper, cropRect); + kernel<<>>(inputWrapper, outputWrapper, cropRect); break; } diff --git a/src/op_cvt_color.cpp b/src/op_cvt_color.cpp index 98c0826b..8f978999 100644 --- a/src/op_cvt_color.cpp +++ b/src/op_cvt_color.cpp @@ -82,59 +82,66 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu switch (conversionCode) { case eColorConversionCode::COLOR_BGR2GRAY: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_grayscale< + uchar3, eSwizzle::ZYXW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(width, height, samples, block); - Kernels::Device::rgb_or_bgr_to_grayscale - <<>>(ImageWrapper(input), ImageWrapper(output)); + kernel<<>>(ImageWrapper(input), ImageWrapper(output)); break; } case eColorConversionCode::COLOR_RGB2GRAY: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_grayscale< + uchar3, eSwizzle::XYZW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(width, height, samples, block); - Kernels::Device::rgb_or_bgr_to_grayscale - <<>>(ImageWrapper(input), ImageWrapper(output)); + kernel<<>>(ImageWrapper(input), ImageWrapper(output)); break; } case eColorConversionCode::COLOR_BGR2RGB: case eColorConversionCode::COLOR_RGB2BGR: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::reorder< + uchar3, eSwizzle::ZYXW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(width, height, samples, block); - Kernels::Device::reorder - <<>>(ImageWrapper(input), ImageWrapper(output)); + kernel<<>>(ImageWrapper(input), ImageWrapper(output)); break; } case eColorConversionCode::COLOR_BGR2YUV: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_yuv< + uchar3, eSwizzle::ZYXW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(width, height, samples, block); - Kernels::Device::rgb_or_bgr_to_yuv - <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); + kernel<<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; } case eColorConversionCode::COLOR_RGB2YUV: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_yuv< + uchar3, eSwizzle::XYZW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(width, height, samples, block); - Kernels::Device::rgb_or_bgr_to_yuv - <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); + kernel<<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; } case eColorConversionCode::COLOR_YUV2BGR: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::yuv_to_rgb_or_bgr< + uchar3, eSwizzle::ZYXW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(width, height, samples, block); - Kernels::Device::yuv_to_rgb_or_bgr - <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); + kernel<<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; } case eColorConversionCode::COLOR_YUV2RGB: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::yuv_to_rgb_or_bgr< + uchar3, eSwizzle::XYZW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(width, height, samples, block); - Kernels::Device::yuv_to_rgb_or_bgr - <<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); + kernel<<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; } diff --git a/src/op_flip.cpp b/src/op_flip.cpp index 6bb7dd33..54886b6f 100644 --- a/src/op_flip.cpp +++ b/src/op_flip.cpp @@ -43,10 +43,11 @@ void dispatch_flip_axis(hipStream_t stream, const Tensor& input, const Tensor& o switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::flip, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); - Kernels::Device::flip<<>>(inputWrapper, outputWrapper); + kernel<<>>(inputWrapper, outputWrapper); break; } diff --git a/src/op_gamma_contrast.cpp b/src/op_gamma_contrast.cpp index 43e66ed8..e764d27c 100644 --- a/src/op_gamma_contrast.cpp +++ b/src/op_gamma_contrast.cpp @@ -43,11 +43,12 @@ void dispatch_gamma_contrast_dtype(hipStream_t stream, const Tensor &input, cons ImageWrapper outputWrapper(output); if (device == eDeviceType::GPU) { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = Kernels::Device::gamma_contrast, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); - Kernels::Device::gamma_contrast<<>>(inputWrapper, outputWrapper, gamma); + kernel<<>>(inputWrapper, outputWrapper, gamma); } else if (device == eDeviceType::CPU) { Kernels::Host::gamma_contrast(inputWrapper, outputWrapper, gamma); } diff --git a/src/op_normalize.cpp b/src/op_normalize.cpp index 35dd5b25..666b317e 100644 --- a/src/op_normalize.cpp +++ b/src/op_normalize.cpp @@ -50,10 +50,13 @@ void dispatch_normalize_stddev(hipStream_t stream, const Tensor& input, const Te switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize1D(); + constexpr auto kernel = + Kernels::Device::normalize, ImageWrapper, ImageWrapper, + ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); dim3 grid = detail::GetGridSize1D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block); - Kernels::Device::normalize - <<>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, epsilon); + kernel<<>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, + epsilon); break; } case eDeviceType::CPU: { diff --git a/src/op_remap.cpp b/src/op_remap.cpp index 76ba7cae..cdd4209d 100644 --- a/src/op_remap.cpp +++ b/src/op_remap.cpp @@ -89,11 +89,12 @@ void dispatch_remap_mapInterp(hipStream_t stream, const Tensor &input, const Ten // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize2D(); + constexpr auto kernel = Kernels::Device::remap< + InterpolationWrapper, ImageWrapper, InterpolationWrapper>; + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); - Kernels::Device::remap<<>>(inputWrapper, outputWrapper, wrappedMapTensor, - mapBatchSize, params); + kernel<<>>(inputWrapper, outputWrapper, wrappedMapTensor, mapBatchSize, params); break; } diff --git a/src/op_resize.cpp b/src/op_resize.cpp index f40373fa..30d9dc14 100644 --- a/src/op_resize.cpp +++ b/src/op_resize.cpp @@ -45,10 +45,12 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize2D(); + constexpr auto kernel = Kernels::Device::resize< + InterpolationWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); - Kernels::Device::resize<<>>(inputWrapper, outputWrapper, scaleX, scaleY); + kernel<<>>(inputWrapper, outputWrapper, scaleX, scaleY); break; } diff --git a/src/op_rotate.cpp b/src/op_rotate.cpp index 9afec0ac..be7d73a4 100644 --- a/src/op_rotate.cpp +++ b/src/op_rotate.cpp @@ -60,9 +60,12 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize2D(); + constexpr auto kernel = Kernels::Device::rotate< + InterpolationWrapper, ImageWrapper, + ArrayWrapper>; + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block); - Kernels::Device::rotate<<>>(inputWrap, outputWrap, matWrap); + kernel<<>>(inputWrap, outputWrap, matWrap); break; } diff --git a/src/op_thresholding.cpp b/src/op_thresholding.cpp index c78c9884..2325c5eb 100644 --- a/src/op_thresholding.cpp +++ b/src/op_thresholding.cpp @@ -51,32 +51,44 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten const auto width = input.shape()[input.shape().layout().width_index()]; if (device == eDeviceType::GPU) { - dim3 block = detail::GetBlockSize1D(); - dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); - switch (m_threshType) { - case THRESH_BINARY: - Kernels::Device::binary_generic<<>>(inputWrapper, outputWrapper, - GenericTensorWrapper(thresh), - GenericTensorWrapper(maxVal)); + case THRESH_BINARY: { + constexpr auto kernel = Kernels::Device::binary_generic, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, GenericTensorWrapper(thresh), + GenericTensorWrapper(maxVal)); break; - case THRESH_BINARY_INV: - Kernels::Device::binary_inv_generic<<>>(inputWrapper, outputWrapper, - GenericTensorWrapper(thresh), - GenericTensorWrapper(maxVal)); + } + case THRESH_BINARY_INV: { + constexpr auto kernel = Kernels::Device::binary_inv_generic, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, GenericTensorWrapper(thresh), + GenericTensorWrapper(maxVal)); break; - case THRESH_TRUNC: - Kernels::Device::trunc_generic<<>>(inputWrapper, outputWrapper, - GenericTensorWrapper(thresh)); + } + case THRESH_TRUNC: { + constexpr auto kernel = Kernels::Device::trunc_generic, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, GenericTensorWrapper(thresh)); break; - case THRESH_TOZERO: - Kernels::Device::tozero_generic<<>>(inputWrapper, outputWrapper, - GenericTensorWrapper(thresh)); + } + case THRESH_TOZERO: { + constexpr auto kernel = Kernels::Device::tozero_generic, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, GenericTensorWrapper(thresh)); break; - case THRESH_TOZERO_INV: - Kernels::Device::tozeroinv_generic<<>>(inputWrapper, outputWrapper, - GenericTensorWrapper(thresh)); + } + case THRESH_TOZERO_INV: { + constexpr auto kernel = Kernels::Device::tozeroinv_generic, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, GenericTensorWrapper(thresh)); break; + } } } else if (device == eDeviceType::CPU) { switch (m_threshType) { diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp index 9dd0d700..4db3f3f4 100644 --- a/src/op_warp_perspective.cpp +++ b/src/op_warp_perspective.cpp @@ -42,10 +42,12 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block = detail::GetBlockSize2D(); + constexpr auto kernel = Kernels::Device::warp_perspective< + InterpolationWrapper, ImageWrapper, ArrayWrapper>; + dim3 block = detail::GetBlockSize2D(); dim3 grid = detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); - Kernels::Device::warp_perspective<<>>(inputWrapper, outputWrapper, transform); + kernel<<>>(inputWrapper, outputWrapper, transform); break; } From f5baca12a09712b793bb1f44bbdb8a9338b751ed Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 27 Apr 2026 14:08:01 -0400 Subject: [PATCH 14/17] Increase default caps --- include/core/detail/hip_utils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 74a17108..57304e3a 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -115,7 +115,7 @@ inline int CachedOccupancyBlockSize() { * @tparam Cap Upper bound on threads per block. * @return dim3(blockSize, 1, 1). */ -template +template inline dim3 GetBlockSize1D() { return dim3(CachedOccupancyBlockSize(), 1, 1); } @@ -133,7 +133,7 @@ inline dim3 GetBlockSize1D() { * @tparam Cap Upper bound on threads per block. * @return dim3(warpSize, blockSize / warpSize, 1). */ -template +template inline dim3 GetBlockSize2D() { int blockSize = CachedOccupancyBlockSize(); int warpSize = CachedWarpSize(); From 00a543dbf32bfd3c1b660c68e40696e454d4d18f Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 27 Apr 2026 14:12:53 -0400 Subject: [PATCH 15/17] Decrease 1D block size cap to 256 --- include/core/detail/hip_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 57304e3a..85e24d3e 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -115,7 +115,7 @@ inline int CachedOccupancyBlockSize() { * @tparam Cap Upper bound on threads per block. * @return dim3(blockSize, 1, 1). */ -template +template inline dim3 GetBlockSize1D() { return dim3(CachedOccupancyBlockSize(), 1, 1); } From e6f55882c1d2218d0cc5d6d464283c4b9b79b09b Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 27 Apr 2026 14:13:18 -0400 Subject: [PATCH 16/17] Increase block size cap to 1024 for 1D blocks --- include/core/detail/hip_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 85e24d3e..036496d1 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -115,7 +115,7 @@ inline int CachedOccupancyBlockSize() { * @tparam Cap Upper bound on threads per block. * @return dim3(blockSize, 1, 1). */ -template +template inline dim3 GetBlockSize1D() { return dim3(CachedOccupancyBlockSize(), 1, 1); } From fc8c00871146d3d13705c63b6ca7b2c3f62f1002 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Mon, 27 Apr 2026 14:17:40 -0400 Subject: [PATCH 17/17] Return cap sizes back to normal defaults --- include/core/detail/hip_utils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index 036496d1..74a17108 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -115,7 +115,7 @@ inline int CachedOccupancyBlockSize() { * @tparam Cap Upper bound on threads per block. * @return dim3(blockSize, 1, 1). */ -template +template inline dim3 GetBlockSize1D() { return dim3(CachedOccupancyBlockSize(), 1, 1); } @@ -133,7 +133,7 @@ inline dim3 GetBlockSize1D() { * @tparam Cap Upper bound on threads per block. * @return dim3(warpSize, blockSize / warpSize, 1). */ -template +template inline dim3 GetBlockSize2D() { int blockSize = CachedOccupancyBlockSize(); int warpSize = CachedWarpSize();