From 77b2090853443c59c60fed4b411ec2cbdc5debaa Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 9 Feb 2026 13:31:35 -0500
Subject: [PATCH 01/17] Create helper function for determining maximum block
 size

---
 include/core/detail/hip_utils.hpp | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index d8a918f1..59e092d6 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -25,6 +25,8 @@
 #include <functional>
 #include <memory>
 
+#include "core/hip_assert.h"
+
 namespace roccv::detail {
 static void StreamCallback(void* userData) {
     std::function<void()>* func = static_cast<std::function<void()>*>(userData);
@@ -46,4 +48,26 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) {
     HIP_VALIDATE_NO_ERRORS(hipLaunchHostFunc(stream, StreamCallback, data.get()));
     data.release();  // Release ownership, StreamCallback is responsible for it now
 }
+
+/**
+ * @brief Get the maximum potential block size for a 2D kernel.
+ *
+ * @param[in] kernel The kernel function to get the maximum potential block size for.
+ * @param[in] sharedMemSizePerBlock The shared memory size per block.
+ * @return The maximum potential block size.
+ */
+template <typename KernalFunc>
+dim3 GetMaximumPotentialBlockSize2D(KernalFunc kernel, size_t sharedMemSizePerBlock) {
+    int minimumGridSize;
+    int blockSize;
+    int deviceId;
+    int warpSize;
+
+    HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId));
+    HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId));
+    HIP_VALIDATE_NO_ERRORS(
+        hipOccupancyMaxPotentialBlockSize(&minimumGridSize, &blockSize, kernel, sharedMemSizePerBlock, warpSize));
+
+    return dim3(warpSize, blockSize / warpSize, 1);
+}
 }  // namespace roccv::detail
\ No newline at end of file

From 6d616afe5ab9ffe9bcc775537520249be92b3c13 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 9 Feb 2026 13:44:22 -0500
Subject: [PATCH 02/17] Define StreamCallback as static inline

---
 include/core/detail/hip_utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 59e092d6..0030221a 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -28,7 +28,7 @@
 #include "core/hip_assert.h"
 
 namespace roccv::detail {
-static void StreamCallback(void* userData) {
+static inline void StreamCallback(void* userData) {
     std::function<void()>* func = static_cast<std::function<void()>*>(userData);
     (*func)();
     delete func;

From 348816dd58b536f932948c72a9458ace1b84ea84 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 9 Feb 2026 13:44:48 -0500
Subject: [PATCH 03/17] Use dynamic block size helper for Composite operator

---
 src/op_composite.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/op_composite.cpp b/src/op_composite.cpp
index 6d49a156..8073c7f4 100644
--- a/src/op_composite.cpp
+++ b/src/op_composite.cpp
@@ -24,6 +24,7 @@
 #include <functional>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/composite_device.hpp"
 #include "kernels/host/composite_host.hpp"
@@ -40,7 +41,8 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
+            dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                Kernels::Device::composite<ImageWrapper<SrcType>, ImageWrapper<MaskType>, ImageWrapper<DstType>>, 0);
             dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
                       outputWrapper.batches());
             Kernels::Device::composite<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);

From 3b0488e4d298921d5d895241393e17611bd734c4 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 9 Feb 2026 14:18:04 -0500
Subject: [PATCH 04/17] Add helper for grid size calculations

---
 include/core/detail/hip_utils.hpp | 14 ++++++++++++++
 src/op_bnd_box.cpp                |  5 +++--
 src/op_composite.cpp              |  4 ++--
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 0030221a..0195df9c 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -70,4 +70,18 @@ dim3 GetMaximumPotentialBlockSize2D(KernalFunc kernel, size_t sharedMemSizePerBl
 
     return dim3(warpSize, blockSize / warpSize, 1);
 }
+
+/**
+ * @brief Get the grid size for a 2D kernel.
+ *
+ * @param[in] width The width of the image.
+ * @param[in] height The height of the image.
+ * @param[in] batchSize The batch size of the image.
+ * @param[in] blockSize The block size of the kernel.
+ * @return The grid size.
+ */
+static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize, dim3 blockSize) {
+    return dim3((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, batchSize);
+}
+
 }  // namespace roccv::detail
\ No newline at end of file
diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp
index e3cfa717..82c1ba5c 100644
--- a/src/op_bnd_box.cpp
+++ b/src/op_bnd_box.cpp
@@ -52,8 +52,9 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
     auto batchSize = inputWrapper.batches();
     switch (device) {
         case eDeviceType::GPU: {
-            const dim3 block(32, 32);
-            const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y, batchSize);
+            const dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                Kernels::Device::bndbox_kernel<has_alpha, T, ImageWrapper<T>, ImageWrapper<T>>, 0);
+            const dim3 grid = detail::GetGridSize2D(width, height, batchSize, block);
 
             Rect_t *rects_ptr = nullptr;
             const auto n_rects = rects->size();
diff --git a/src/op_composite.cpp b/src/op_composite.cpp
index 8073c7f4..1b21cc58 100644
--- a/src/op_composite.cpp
+++ b/src/op_composite.cpp
@@ -43,8 +43,8 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c
         case eDeviceType::GPU: {
             dim3 block = detail::GetMaximumPotentialBlockSize2D(
                 Kernels::Device::composite<ImageWrapper<SrcType>, ImageWrapper<MaskType>, ImageWrapper<DstType>>, 0);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
+            dim3 grid =
+                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::composite<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
             break;
         }

From 2ceddcedaaa141fe9413c8f163c8fcf251b88118 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 9 Feb 2026 14:48:54 -0500
Subject: [PATCH 05/17] Add dynamic block size calculations to operators

---
 include/kernels/device/bnd_box_device.hpp |  1 +
 include/kernels/host/bnd_box_host.hpp     |  1 +
 src/op_bnd_box.cpp                        |  2 -
 src/op_copy_make_border.cpp               | 11 +--
 src/op_custom_crop.cpp                    |  8 +-
 src/op_cvt_color.cpp                      | 92 ++++++++++++++++-------
 src/op_flip.cpp                           |  8 +-
 src/op_gamma_contrast.cpp                 | 13 ++--
 src/op_normalize.cpp                      | 12 ++-
 src/op_resize.cpp                         | 18 +++--
 src/op_rotate.cpp                         |  7 +-
 src/op_thresholding.cpp                   | 36 ++++++---
 src/op_warp_perspective.cpp               |  8 +-
 13 files changed, 140 insertions(+), 77 deletions(-)

diff --git a/include/kernels/device/bnd_box_device.hpp b/include/kernels/device/bnd_box_device.hpp
index 9076e634..14900490 100644
--- a/include/kernels/device/bnd_box_device.hpp
+++ b/include/kernels/device/bnd_box_device.hpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 #include <core/detail/type_traits.hpp>
 
+#include "common/math_vector.hpp"
 #include "kernels/kernel_helpers.hpp"
 #include "operator_types.h"
 
diff --git a/include/kernels/host/bnd_box_host.hpp b/include/kernels/host/bnd_box_host.hpp
index db1af9f7..7b3826c1 100644
--- a/include/kernels/host/bnd_box_host.hpp
+++ b/include/kernels/host/bnd_box_host.hpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 #include <core/detail/type_traits.hpp>
 
+#include "common/math_vector.hpp"
 #include "kernels/kernel_helpers.hpp"
 #include "operator_types.h"
 
diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp
index 82c1ba5c..6564658a 100644
--- a/src/op_bnd_box.cpp
+++ b/src/op_bnd_box.cpp
@@ -25,10 +25,8 @@ THE SOFTWARE.
 
 #include <algorithm>
 #include <cstring>
-#include <iostream>
 #include <vector>
 
-#include "common/math_vector.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/detail/hip_utils.hpp"
 #include "core/tensor.hpp"
diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp
index 74d45894..78579393 100644
--- a/src/op_copy_make_border.cpp
+++ b/src/op_copy_make_border.cpp
@@ -24,9 +24,10 @@
 #include <functional>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/casting.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/border_wrapper.hpp"
 #include "core/wrappers/image_wrapper.hpp"
-#include "core/wrappers/interpolation_wrapper.hpp"
 #include "kernels/device/copy_make_border_device.hpp"
 #include "kernels/host/copy_make_border_host.hpp"
 
@@ -43,10 +44,10 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block_dim(64, 16);
-            dim3 grid_dim((out_desc.width() + block_dim.x - 1) / block_dim.x,
-                          (out_desc.height() + block_dim.y - 1) / block_dim.y, out_desc.batches());
-            Kernels::Device::copy_make_border<<<grid_dim, block_dim, 0, stream>>>(in_desc, out_desc, top, left);
+            dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                Kernels::Device::copy_make_border<BorderWrapper<T, BorderMode>, ImageWrapper<T>>, 0);
+            dim3 grid = detail::GetGridSize2D(out_desc.width(), out_desc.height(), out_desc.batches(), block);
+            Kernels::Device::copy_make_border<<<grid, block, 0, stream>>>(in_desc, out_desc, top, left);
             break;
         }
         case eDeviceType::CPU: {
diff --git a/src/op_custom_crop.cpp b/src/op_custom_crop.cpp
index e0e84c77..049777b9 100644
--- a/src/op_custom_crop.cpp
+++ b/src/op_custom_crop.cpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include <functional>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/custom_crop_device.hpp"
 #include "kernels/host/custom_crop_host.hpp"
@@ -40,9 +41,10 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
+            dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                Kernels::Device::custom_crop<ImageWrapper<T>, ImageWrapper<T>>, 0);
+            dim3 grid =
+                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::custom_crop<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
             break;
         }
diff --git a/src/op_cvt_color.cpp b/src/op_cvt_color.cpp
index 31abd103..0a29518e 100644
--- a/src/op_cvt_color.cpp
+++ b/src/op_cvt_color.cpp
@@ -23,9 +23,8 @@ THE SOFTWARE.
 
 #include <hip/hip_runtime.h>
 
-#include <iostream>
-
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/tensor.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/cvt_color_device.hpp"
@@ -81,45 +80,82 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
     if (device == eDeviceType::GPU) {
         // Dispatch appropriate device kernel based on given conversion code
 
-        dim3 blockSize(32, 16);
-        dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, samples);
-
         switch (conversionCode) {
-            case eColorConversionCode::COLOR_BGR2GRAY:
+            case eColorConversionCode::COLOR_BGR2GRAY: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>,
+                                                             ImageWrapper<uchar1>>,
+                    0);
+                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
                 Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::ZYXW>
-                    <<<gridSize, blockSize, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
+                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
                 break;
-
-            case eColorConversionCode::COLOR_RGB2GRAY:
+            }
+
+            case eColorConversionCode::COLOR_RGB2GRAY: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>,
+                                                             ImageWrapper<uchar1>>,
+                    0);
+                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
                 Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::XYZW>
-                    <<<gridSize, blockSize, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
+                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
                 break;
+            }
 
             case eColorConversionCode::COLOR_BGR2RGB:
-            case eColorConversionCode::COLOR_RGB2BGR:
+            case eColorConversionCode::COLOR_RGB2BGR: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::reorder<uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>, 0);
+                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
                 Kernels::Device::reorder<uchar3, eSwizzle::ZYXW>
-                    <<<gridSize, blockSize, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output));
+                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output));
                 break;
-
-            case eColorConversionCode::COLOR_BGR2YUV:
-                Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::ZYXW><<<gridSize, blockSize, 0, stream>>>(
-                    ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+            }
+
+            case eColorConversionCode::COLOR_BGR2YUV: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>,
+                                                       ImageWrapper<uchar3>>,
+                    0);
+                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::ZYXW>
+                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
-
-            case eColorConversionCode::COLOR_RGB2YUV:
-                Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::XYZW><<<gridSize, blockSize, 0, stream>>>(
-                    ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+            }
+
+            case eColorConversionCode::COLOR_RGB2YUV: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>,
+                                                       ImageWrapper<uchar3>>,
+                    0);
+                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::XYZW>
+                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
-
-            case eColorConversionCode::COLOR_YUV2BGR:
-                Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::ZYXW><<<gridSize, blockSize, 0, stream>>>(
-                    ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+            }
+
+            case eColorConversionCode::COLOR_YUV2BGR: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>,
+                                                       ImageWrapper<uchar3>>,
+                    0);
+                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::ZYXW>
+                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
-
-            case eColorConversionCode::COLOR_YUV2RGB:
-                Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::XYZW><<<gridSize, blockSize, 0, stream>>>(
-                    ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+            }
+
+            case eColorConversionCode::COLOR_YUV2RGB: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>,
+                                                       ImageWrapper<uchar3>>,
+                    0);
+                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::XYZW>
+                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
+            }
 
             default:
                 throw Exception("Not implemented", eStatusType::NOT_IMPLEMENTED);
diff --git a/src/op_flip.cpp b/src/op_flip.cpp
index 84e5ccd9..66219220 100644
--- a/src/op_flip.cpp
+++ b/src/op_flip.cpp
@@ -27,6 +27,7 @@ THE SOFTWARE.
 #include <unordered_map>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/exception.hpp"
 #include "core/status_type.h"
 #include "core/wrappers/image_wrapper.hpp"
@@ -42,9 +43,10 @@ void dispatch_flip_axis(hipStream_t stream, const Tensor& input, const Tensor& o
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
+            dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                Kernels::Device::flip<FlipType, ImageWrapper<T>, ImageWrapper<T>>, 0);
+            dim3 grid =
+                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::flip<FlipType><<<grid, block, 0, stream>>>(inputWrapper, outputWrapper);
             break;
         }
diff --git a/src/op_gamma_contrast.cpp b/src/op_gamma_contrast.cpp
index 98a44a06..04eee593 100644
--- a/src/op_gamma_contrast.cpp
+++ b/src/op_gamma_contrast.cpp
@@ -24,15 +24,11 @@ THE SOFTWARE.
 
 #include <hip/hip_runtime.h>
 
-#include <algorithm>
 #include <cstring>
 #include <functional>
-#include <iostream>
-#include <vector>
 
-#include "common/array_wrapper.hpp"
-#include "common/math_vector.hpp"
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/tensor.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/gamma_contrast_device.hpp"
@@ -47,9 +43,10 @@ void dispatch_gamma_contrast_dtype(hipStream_t stream, const Tensor &input, cons
     ImageWrapper<T> outputWrapper(output);
 
     if (device == eDeviceType::GPU) {
-        dim3 block(64, 16);
-        dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                  outputWrapper.batches());
+        dim3 block = detail::GetMaximumPotentialBlockSize2D(
+            Kernels::Device::gamma_contrast<ImageWrapper<T>, ImageWrapper<T>>, 0);
+        dim3 grid =
+            detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
 
         Kernels::Device::gamma_contrast<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, gamma);
     } else if (device == eDeviceType::CPU) {
diff --git a/src/op_normalize.cpp b/src/op_normalize.cpp
index 7cc2569e..90493ff3 100644
--- a/src/op_normalize.cpp
+++ b/src/op_normalize.cpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include <unordered_map>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/detail/type_traits.hpp"
 #include "core/tensor.hpp"
 #include "core/wrappers/image_wrapper.hpp"
@@ -49,9 +50,11 @@ void dispatch_normalize_stddev(hipStream_t stream, const Tensor& input, const Te
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(32, 8);
-            dim3 grid((outputWrap.width() + block.x - 1) / block.x, (outputWrap.height() + block.y - 1) / block.y,
-                      outputWrap.batches());
+            dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                Kernels::Device::normalize<ScaleStddev, ImageWrapper<T>, ImageWrapper<T>, ImageWrapper<work_type>,
+                                           ImageWrapper<work_type>>,
+                0);
+            dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block);
             Kernels::Device::normalize<ScaleStddev>
                 <<<grid, block, 0, stream>>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, epsilon);
             break;
@@ -112,7 +115,8 @@ void Normalize::operator()(hipStream_t stream, const Tensor& input, const Tensor
     // TODO: Need to support scalar base/scale tensors at some point. Will require some extra handling on the kernel
     // level. Once in place, this check can be removed.
     CHECK_TENSOR_COMPARISON(base.shape(base.layout().channels_index()) == input.shape(input.layout().channels_index()));
-    CHECK_TENSOR_COMPARISON(scale.shape(scale.layout().channels_index()) == input.shape(input.layout().channels_index()));
+    CHECK_TENSOR_COMPARISON(scale.shape(scale.layout().channels_index()) ==
+                            input.shape(input.layout().channels_index()));
 
     // Create kernel dispatching table based on input/output datatype and number of channels.
     // clang-format off
diff --git a/src/op_resize.cpp b/src/op_resize.cpp
index ef508d5d..bc74c8a3 100644
--- a/src/op_resize.cpp
+++ b/src/op_resize.cpp
@@ -25,7 +25,7 @@ THE SOFTWARE.
 #include <unordered_map>
 
 #include "common/validation_helpers.hpp"
-#include "core/detail/casting.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/exception.hpp"
 #include "core/status_type.h"
 #include "core/wrappers/interpolation_wrapper.hpp"
@@ -45,9 +45,10 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
+            dim3 block =
+                detail::GetMaximumPotentialBlockSize2D(Kernels::Device::resize<ImageWrapper<T>, ImageWrapper<T>>, 0);
+            dim3 grid =
+                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::resize<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, scaleX, scaleY);
             break;
         }
@@ -65,10 +66,11 @@ void dispatch_resize_dtype(hipStream_t stream, const Tensor& input, const Tensor
     static const std::unordered_map<
         eInterpolationType,
         std::function<void(hipStream_t stream, const Tensor& input, const Tensor& output, const eDeviceType device)>>
-        funcs = {{eInterpolationType::INTERP_TYPE_NEAREST, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_NEAREST>},
-                 {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_LINEAR>},
-                 {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_CUBIC>}
-                };
+        funcs = {
+            {eInterpolationType::INTERP_TYPE_NEAREST,
+             dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_NEAREST>},
+            {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_LINEAR>},
+            {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_CUBIC>}};
 
     if (!funcs.contains(interpolation)) {
         throw Exception("Operation does not support the given interpolation mode.", eStatusType::NOT_IMPLEMENTED);
diff --git a/src/op_rotate.cpp b/src/op_rotate.cpp
index d692631f..a9f2bbb0 100644
--- a/src/op_rotate.cpp
+++ b/src/op_rotate.cpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 #include "common/array_wrapper.hpp"
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/interpolation_wrapper.hpp"
 #include "kernels/device/rotate_device.hpp"
 #include "kernels/host/rotate_host.hpp"
@@ -59,9 +60,9 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(32, 16);
-            dim3 grid((outputWrap.width() + block.x - 1) / block.x, (outputWrap.height() + block.y - 1) / block.y,
-                      outputWrap.batches());
+            dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                Kernels::Device::rotate<ImageWrapper<T>, ImageWrapper<T>, ArrayWrapper<double, 6>>, 0);
+            dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block);
             Kernels::Device::rotate<<<grid, block, 0, stream>>>(inputWrap, outputWrap, matWrap);
             break;
         }
diff --git a/src/op_thresholding.cpp b/src/op_thresholding.cpp
index 04613a2c..72ea45d9 100644
--- a/src/op_thresholding.cpp
+++ b/src/op_thresholding.cpp
@@ -26,8 +26,7 @@ THE SOFTWARE.
 #include <functional>
 
 #include "common/validation_helpers.hpp"
-#include "core/exception.hpp"
-#include "core/status_type.h"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/generic_tensor_wrapper.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/thresholding_device.hpp"
@@ -50,32 +49,49 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten
     const auto width = input.shape()[input.shape().layout().width_index()];
 
     if (device == eDeviceType::GPU) {
-        dim3 block(64, 16);
-        dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y, m_maxBatchSize);
-
         switch (m_threshType) {
-            case THRESH_BINARY:
+            case THRESH_BINARY: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::binary_generic<ImageWrapper<T>, ImageWrapper<T>>, 0);
+                dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block);
                 Kernels::Device::binary_generic<<<grid, block, 0, stream>>>(
                     inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh),
                     GenericTensorWrapper<double>(maxVal), m_maxBatchSize);
                 break;
-            case THRESH_BINARY_INV:
+            }
+            case THRESH_BINARY_INV: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::binary_inv_generic<ImageWrapper<T>, ImageWrapper<T>>, 0);
+                dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block);
                 Kernels::Device::binary_inv_generic<<<grid, block, 0, stream>>>(
                     inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh),
                     GenericTensorWrapper<double>(maxVal), m_maxBatchSize);
                 break;
-            case THRESH_TRUNC:
+            }
+            case THRESH_TRUNC: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::trunc_generic<ImageWrapper<T>, ImageWrapper<T>>, 0);
+                dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block);
                 Kernels::Device::trunc_generic<<<grid, block, 0, stream>>>(
                     inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh), m_maxBatchSize);
                 break;
-            case THRESH_TOZERO:
+            }
+            case THRESH_TOZERO: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::tozero_generic<ImageWrapper<T>, ImageWrapper<T>>, 0);
+                dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block);
                 Kernels::Device::tozero_generic<<<grid, block, 0, stream>>>(
                     inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh), m_maxBatchSize);
                 break;
-            case THRESH_TOZERO_INV:
+            }
+            case THRESH_TOZERO_INV: {
+                dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                    Kernels::Device::tozeroinv_generic<ImageWrapper<T>, ImageWrapper<T>>, 0);
+                dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block);
                 Kernels::Device::tozeroinv_generic<<<grid, block, 0, stream>>>(
                     inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh), m_maxBatchSize);
                 break;
+            }
         }
 
     } else if (device == eDeviceType::CPU) {
diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp
index c90e36fb..6cf9052a 100644
--- a/src/op_warp_perspective.cpp
+++ b/src/op_warp_perspective.cpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include "common/array_wrapper.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/detail/casting.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/detail/math/math.hpp"
 #include "core/detail/type_traits.hpp"
 #include "kernels/device/warp_perspective_device.hpp"
@@ -43,9 +44,10 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
+            dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                Kernels::Device::warp_perspective<ImageWrapper<T>, ImageWrapper<T>, ArrayWrapper<float, 9>>, 0);
+            dim3 grid =
+                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::warp_perspective<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);
             break;
         }

From 8c84900e2413ca529573810e24dfbbdc65a91dca Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 9 Feb 2026 16:46:11 -0500
Subject: [PATCH 06/17] Add dynamic block sizes to rest of operators

---
 src/op_bilateral_filter.cpp    |  8 +++-----
 src/op_convert_to.cpp          | 35 +++++++++++++++++-----------------
 src/op_histogram.cpp           |  2 --
 src/op_non_max_suppression.cpp |  1 -
 src/op_remap.cpp               | 11 +++++------
 src/op_warp_perspective.cpp    |  1 -
 6 files changed, 25 insertions(+), 33 deletions(-)

diff --git a/src/op_bilateral_filter.cpp b/src/op_bilateral_filter.cpp
index 7c39ac40..13e5c317 100644
--- a/src/op_bilateral_filter.cpp
+++ b/src/op_bilateral_filter.cpp
@@ -24,14 +24,11 @@ THE SOFTWARE.
 #include <hip/hip_runtime.h>
 
 #include <functional>
-#include <iostream>
 #include <numeric>
 
-#include "common/array_wrapper.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/detail/casting.hpp"
-#include "core/detail/math/math.hpp"
-#include "core/detail/type_traits.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "kernels/device/bilateral_filter_device.hpp"
 #include "kernels/host/bilateral_filter_host.hpp"
 
@@ -72,7 +69,8 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp
     float colorCoeff = -1 / (2 * sigmaColor * sigmaColor);
 
     if (device == eDeviceType::GPU) {
-        dim3 block(8, 8);
+        dim3 block = detail::GetMaximumPotentialBlockSize2D(
+            Kernels::Device::bilateral_filter<T, ImageWrapper<T>, ImageWrapper<T>>, 0);
         uint32_t xGridSize = (outputWrapper.width() + (block.x * 2) - 1) / (block.x * 2);
         uint32_t yGridSize = (outputWrapper.height() + (block.y * 2) - 1) / (block.y * 2);
         uint32_t zGridSize = outputWrapper.batches();
diff --git a/src/op_convert_to.cpp b/src/op_convert_to.cpp
index 6a8173d4..8e59d446 100644
--- a/src/op_convert_to.cpp
+++ b/src/op_convert_to.cpp
@@ -21,22 +21,23 @@ THE SOFTWARE.
 */
 #include "op_convert_to.hpp"
 
+#include <hip/hip_runtime.h>
+
 #include <functional>
 
-#include <hip/hip_runtime.h>
-#include "core/wrappers/image_wrapper.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/detail/casting.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/detail/type_traits.hpp"
+#include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/convert_to_device.hpp"
 #include "kernels/host/convert_to_host.hpp"
 
 namespace roccv {
 
 template <typename SRC_DT, typename DST_DT, int NC>
-void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output,
-                                       const double alpha, const double beta, const eDeviceType device) {
-    
+void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output, const double alpha,
+                                  const double beta, const eDeviceType device) {
     using SRC_DT_NC = detail::MakeType<SRC_DT, NC>;
     using DST_DT_NC = detail::MakeType<DST_DT, NC>;
 
@@ -47,16 +48,17 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
     using DST_BT = detail::BaseType<DST_DT>;
 
     using DT_AB = decltype(float() * SRC_BT() * DST_BT());
-    
+
     DT_AB alpha_ab = detail::SaturateCast<DT_AB>(alpha);
     DT_AB beta_ab = detail::SaturateCast<DT_AB>(beta);
 
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
+            dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                Kernels::Device::convert_to<ImageWrapper<SRC_DT_NC>, ImageWrapper<DST_DT_NC>, DT_AB>, 0);
+            dim3 grid =
+                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::convert_to<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
             break;
         }
@@ -68,16 +70,14 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
 }
 
 template <typename SRC_DT, typename DST_DT>
-void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
-                                       const double alpha, const double beta, const eDeviceType device) {
-
+void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, const double alpha,
+                                      const double beta, const eDeviceType device) {
     int64_t channels = output.shape(output.layout().channels_index());
     // Select kernel dispatcher based on number of channels.
     // clang-format off
     static const std::array<std::function<void(hipStream_t, const Tensor &, const Tensor &, const double, const double, const eDeviceType)>, 4>
         funcs = {dispatch_convert_to_channels<SRC_DT, DST_DT, 1>, dispatch_convert_to_channels<SRC_DT, DST_DT, 2>, dispatch_convert_to_channels<SRC_DT, DST_DT, 3>, dispatch_convert_to_channels<SRC_DT, DST_DT, 4>};
-        
-            
+
     // clang-format on
 
     auto func = funcs.at(channels - 1);
@@ -86,11 +86,10 @@ void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, c
 }
 
 template <typename SRC_DT>
-void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
-                                       const double alpha, const double beta, const eDeviceType device) {
-    
+void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, const double alpha,
+                                     const double beta, const eDeviceType device) {
     eDataType output_dtype = output.dtype().etype();
-    
+
     // Select kernel dispatcher based on a base input datatype.
     // clang-format off
     static const std::unordered_map<eDataType, std::function<void(hipStream_t, const Tensor &, const Tensor &, const double, const double, const eDeviceType)>>
diff --git a/src/op_histogram.cpp b/src/op_histogram.cpp
index 13cb6ac2..832a7dfe 100644
--- a/src/op_histogram.cpp
+++ b/src/op_histogram.cpp
@@ -22,13 +22,11 @@ THE SOFTWARE.
 #include "op_histogram.hpp"
 
 #include <hip/hip_runtime.h>
-#include <stdio.h>
 
 #include <cstring>
 #include <functional>
 #include <unordered_map>
 
-#include "common/array_wrapper.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/wrappers/generic_tensor_wrapper.hpp"
 #include "core/wrappers/image_wrapper.hpp"
diff --git a/src/op_non_max_suppression.cpp b/src/op_non_max_suppression.cpp
index 0900fc50..fc61b51b 100644
--- a/src/op_non_max_suppression.cpp
+++ b/src/op_non_max_suppression.cpp
@@ -24,7 +24,6 @@ THE SOFTWARE.
 #include <hip/hip_runtime.h>
 
 #include "common/validation_helpers.hpp"
-#include "core/hip_assert.h"
 #include "kernels/device/non_max_suppression_device.hpp"
 #include "kernels/host/non_max_suppression_host.hpp"
 
diff --git a/src/op_remap.cpp b/src/op_remap.cpp
index 27f2e284..c7bde25b 100644
--- a/src/op_remap.cpp
+++ b/src/op_remap.cpp
@@ -23,11 +23,9 @@ THE SOFTWARE.
 
 #include <functional>
 
-#include "common/array_wrapper.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/detail/casting.hpp"
-#include "core/detail/math/math.hpp"
-#include "core/detail/type_traits.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "core/wrappers/interpolation_wrapper.hpp"
 #include "kernels/device/remap_device.hpp"
@@ -49,9 +47,10 @@ void dispatch_remap_mapInterp(hipStream_t stream, const Tensor &input, const Ten
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
+            dim3 block = detail::GetMaximumPotentialBlockSize2D(
+                Kernels::Device::remap<ImageWrapper<T>, ImageWrapper<T>, InterpolationWrapper<float2, B, M>>, 0);
+            dim3 grid =
+                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::remap<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, wrappedMapTensor);
             break;
         }
diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp
index 6cf9052a..5857e2b6 100644
--- a/src/op_warp_perspective.cpp
+++ b/src/op_warp_perspective.cpp
@@ -28,7 +28,6 @@ THE SOFTWARE.
 #include "core/detail/casting.hpp"
 #include "core/detail/hip_utils.hpp"
 #include "core/detail/math/math.hpp"
-#include "core/detail/type_traits.hpp"
 #include "kernels/device/warp_perspective_device.hpp"
 #include "kernels/host/warp_perspective_host.hpp"
 

From 284eca8366697a0ce084d3f023dab192c62367a3 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Tue, 17 Feb 2026 12:47:24 -0500
Subject: [PATCH 07/17] Address PR comments

---
 include/core/detail/hip_utils.hpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 0195df9c..ad1c264f 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -56,8 +56,8 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) {
  * @param[in] sharedMemSizePerBlock The shared memory size per block.
  * @return The maximum potential block size.
  */
-template <typename KernalFunc>
-dim3 GetMaximumPotentialBlockSize2D(KernalFunc kernel, size_t sharedMemSizePerBlock) {
+template <typename KernelFunc>
+dim3 GetMaximumPotentialBlockSize2D(KernelFunc kernel, size_t sharedMemSizePerBlock) {
     int minimumGridSize;
     int blockSize;
     int deviceId;
@@ -66,9 +66,14 @@ dim3 GetMaximumPotentialBlockSize2D(KernalFunc kernel, size_t sharedMemSizePerBl
     HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId));
     HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId));
     HIP_VALIDATE_NO_ERRORS(
-        hipOccupancyMaxPotentialBlockSize(&minimumGridSize, &blockSize, kernel, sharedMemSizePerBlock, warpSize));
+        hipOccupancyMaxPotentialBlockSize(&minimumGridSize, &blockSize, kernel, sharedMemSizePerBlock, 0));
 
-    return dim3(warpSize, blockSize / warpSize, 1);
+    if (blockSize >= warpSize && (blockSize % warpSize) == 0) {
+        return dim3(warpSize, blockSize / warpSize, 1);
+    }
+
+    // Fallback to block size if it's not a multiple of the warp size
+    return dim3(blockSize, 1, 1);
 }
 
 /**

From 77d79e464e1a860005a9d27f724683e1af1c4f0d Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Thu, 19 Feb 2026 16:26:59 -0500
Subject: [PATCH 08/17] Update block calculations to floor result

---
 include/core/detail/hip_utils.hpp | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index ad1c264f..331d67b4 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -68,12 +68,7 @@ dim3 GetMaximumPotentialBlockSize2D(KernelFunc kernel, size_t sharedMemSizePerBl
     HIP_VALIDATE_NO_ERRORS(
         hipOccupancyMaxPotentialBlockSize(&minimumGridSize, &blockSize, kernel, sharedMemSizePerBlock, 0));
 
-    if (blockSize >= warpSize && (blockSize % warpSize) == 0) {
-        return dim3(warpSize, blockSize / warpSize, 1);
-    }
-
-    // Fallback to block size if it's not a multiple of the warp size
-    return dim3(blockSize, 1, 1);
+    return dim3(warpSize, blockSize / warpSize, 1);
 }
 
 /**

From 1c37b9108c659d7a97fd7866c7695a7315a0ac47 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Fri, 20 Feb 2026 12:05:20 -0500
Subject: [PATCH 09/17] Hardcode target block size instead of querying runtime

---
 include/core/detail/hip_utils.hpp | 16 +++++----------
 src/op_bilateral_filter.cpp       |  3 +--
 src/op_bnd_box.cpp                |  3 +--
 src/op_composite.cpp              |  3 +--
 src/op_convert_to.cpp             |  3 +--
 src/op_copy_make_border.cpp       |  3 +--
 src/op_custom_crop.cpp            |  3 +--
 src/op_cvt_color.cpp              | 33 +++++++------------------------
 src/op_flip.cpp                   |  3 +--
 src/op_gamma_contrast.cpp         |  3 +--
 src/op_normalize.cpp              |  5 +----
 src/op_remap.cpp                  |  3 +--
 src/op_resize.cpp                 |  3 +--
 src/op_rotate.cpp                 |  3 +--
 src/op_thresholding.cpp           | 15 +++++---------
 src/op_warp_perspective.cpp       |  3 +--
 16 files changed, 30 insertions(+), 75 deletions(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 331d67b4..be59eba6 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -50,25 +50,19 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) {
 }
 
 /**
- * @brief Get the maximum potential block size for a 2D kernel.
+ * @brief Get the block size for a 2D kernel.
  *
- * @param[in] kernel The kernel function to get the maximum potential block size for.
- * @param[in] sharedMemSizePerBlock The shared memory size per block.
- * @return The maximum potential block size.
+ * @param[in] targetBlockSize The target block size.
+ * @return The block size.
  */
-template <typename KernelFunc>
-dim3 GetMaximumPotentialBlockSize2D(KernelFunc kernel, size_t sharedMemSizePerBlock) {
-    int minimumGridSize;
-    int blockSize;
+static inline dim3 GetBlockSize2D(int targetBlockSize = 512) {
     int deviceId;
     int warpSize;
 
     HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId));
     HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId));
-    HIP_VALIDATE_NO_ERRORS(
-        hipOccupancyMaxPotentialBlockSize(&minimumGridSize, &blockSize, kernel, sharedMemSizePerBlock, 0));
 
-    return dim3(warpSize, blockSize / warpSize, 1);
+    return dim3(warpSize, targetBlockSize / warpSize, 1);
 }
 
 /**
diff --git a/src/op_bilateral_filter.cpp b/src/op_bilateral_filter.cpp
index 13e5c317..115821f0 100644
--- a/src/op_bilateral_filter.cpp
+++ b/src/op_bilateral_filter.cpp
@@ -69,8 +69,7 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp
     float colorCoeff = -1 / (2 * sigmaColor * sigmaColor);
 
     if (device == eDeviceType::GPU) {
-        dim3 block = detail::GetMaximumPotentialBlockSize2D(
-            Kernels::Device::bilateral_filter<T, ImageWrapper<T>, ImageWrapper<T>>, 0);
+        dim3 block = detail::GetBlockSize2D();
         uint32_t xGridSize = (outputWrapper.width() + (block.x * 2) - 1) / (block.x * 2);
         uint32_t yGridSize = (outputWrapper.height() + (block.y * 2) - 1) / (block.y * 2);
         uint32_t zGridSize = outputWrapper.batches();
diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp
index 6564658a..9e03736f 100644
--- a/src/op_bnd_box.cpp
+++ b/src/op_bnd_box.cpp
@@ -50,8 +50,7 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
     auto batchSize = inputWrapper.batches();
     switch (device) {
         case eDeviceType::GPU: {
-            const dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                Kernels::Device::bndbox_kernel<has_alpha, T, ImageWrapper<T>, ImageWrapper<T>>, 0);
+            const dim3 block = detail::GetBlockSize2D();
             const dim3 grid = detail::GetGridSize2D(width, height, batchSize, block);
 
             Rect_t *rects_ptr = nullptr;
diff --git a/src/op_composite.cpp b/src/op_composite.cpp
index 1b21cc58..db6545f7 100644
--- a/src/op_composite.cpp
+++ b/src/op_composite.cpp
@@ -41,8 +41,7 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                Kernels::Device::composite<ImageWrapper<SrcType>, ImageWrapper<MaskType>, ImageWrapper<DstType>>, 0);
+            dim3 block = detail::GetBlockSize2D();
             dim3 grid =
                 detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::composite<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
diff --git a/src/op_convert_to.cpp b/src/op_convert_to.cpp
index 8e59d446..34503cfb 100644
--- a/src/op_convert_to.cpp
+++ b/src/op_convert_to.cpp
@@ -55,8 +55,7 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                Kernels::Device::convert_to<ImageWrapper<SRC_DT_NC>, ImageWrapper<DST_DT_NC>, DT_AB>, 0);
+            dim3 block = detail::GetBlockSize2D();
             dim3 grid =
                 detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::convert_to<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp
index 78579393..887e3b38 100644
--- a/src/op_copy_make_border.cpp
+++ b/src/op_copy_make_border.cpp
@@ -44,8 +44,7 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                Kernels::Device::copy_make_border<BorderWrapper<T, BorderMode>, ImageWrapper<T>>, 0);
+            dim3 block = detail::GetBlockSize2D();
             dim3 grid = detail::GetGridSize2D(out_desc.width(), out_desc.height(), out_desc.batches(), block);
             Kernels::Device::copy_make_border<<<grid, block, 0, stream>>>(in_desc, out_desc, top, left);
             break;
diff --git a/src/op_custom_crop.cpp b/src/op_custom_crop.cpp
index 049777b9..3d5d2e8f 100644
--- a/src/op_custom_crop.cpp
+++ b/src/op_custom_crop.cpp
@@ -41,8 +41,7 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                Kernels::Device::custom_crop<ImageWrapper<T>, ImageWrapper<T>>, 0);
+            dim3 block = detail::GetBlockSize2D();
             dim3 grid =
                 detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::custom_crop<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
diff --git a/src/op_cvt_color.cpp b/src/op_cvt_color.cpp
index 0a29518e..9094854c 100644
--- a/src/op_cvt_color.cpp
+++ b/src/op_cvt_color.cpp
@@ -82,10 +82,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
 
         switch (conversionCode) {
             case eColorConversionCode::COLOR_BGR2GRAY: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>,
-                                                             ImageWrapper<uchar1>>,
-                    0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, samples, block);
                 Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::ZYXW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
@@ -93,10 +90,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
             }
 
             case eColorConversionCode::COLOR_RGB2GRAY: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>,
-                                                             ImageWrapper<uchar1>>,
-                    0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, samples, block);
                 Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::XYZW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
@@ -105,8 +99,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
 
             case eColorConversionCode::COLOR_BGR2RGB:
             case eColorConversionCode::COLOR_RGB2BGR: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::reorder<uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>, 0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, samples, block);
                 Kernels::Device::reorder<uchar3, eSwizzle::ZYXW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output));
@@ -114,10 +107,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
             }
 
             case eColorConversionCode::COLOR_BGR2YUV: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>,
-                                                       ImageWrapper<uchar3>>,
-                    0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, samples, block);
                 Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::ZYXW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
@@ -125,10 +115,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
             }
 
             case eColorConversionCode::COLOR_RGB2YUV: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>,
-                                                       ImageWrapper<uchar3>>,
-                    0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, samples, block);
                 Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::XYZW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
@@ -136,10 +123,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
             }
 
             case eColorConversionCode::COLOR_YUV2BGR: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>,
-                                                       ImageWrapper<uchar3>>,
-                    0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, samples, block);
                 Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::ZYXW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
@@ -147,10 +131,7 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
             }
 
             case eColorConversionCode::COLOR_YUV2RGB: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>,
-                                                       ImageWrapper<uchar3>>,
-                    0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, samples, block);
                 Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::XYZW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
diff --git a/src/op_flip.cpp b/src/op_flip.cpp
index 66219220..002d4e48 100644
--- a/src/op_flip.cpp
+++ b/src/op_flip.cpp
@@ -43,8 +43,7 @@ void dispatch_flip_axis(hipStream_t stream, const Tensor& input, const Tensor& o
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                Kernels::Device::flip<FlipType, ImageWrapper<T>, ImageWrapper<T>>, 0);
+            dim3 block = detail::GetBlockSize2D();
             dim3 grid =
                 detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::flip<FlipType><<<grid, block, 0, stream>>>(inputWrapper, outputWrapper);
diff --git a/src/op_gamma_contrast.cpp b/src/op_gamma_contrast.cpp
index 04eee593..0362046f 100644
--- a/src/op_gamma_contrast.cpp
+++ b/src/op_gamma_contrast.cpp
@@ -43,8 +43,7 @@ void dispatch_gamma_contrast_dtype(hipStream_t stream, const Tensor &input, cons
     ImageWrapper<T> outputWrapper(output);
 
     if (device == eDeviceType::GPU) {
-        dim3 block = detail::GetMaximumPotentialBlockSize2D(
-            Kernels::Device::gamma_contrast<ImageWrapper<T>, ImageWrapper<T>>, 0);
+        dim3 block = detail::GetBlockSize2D();
         dim3 grid =
             detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
 
diff --git a/src/op_normalize.cpp b/src/op_normalize.cpp
index 90493ff3..802ec680 100644
--- a/src/op_normalize.cpp
+++ b/src/op_normalize.cpp
@@ -50,10 +50,7 @@ void dispatch_normalize_stddev(hipStream_t stream, const Tensor& input, const Te
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                Kernels::Device::normalize<ScaleStddev, ImageWrapper<T>, ImageWrapper<T>, ImageWrapper<work_type>,
-                                           ImageWrapper<work_type>>,
-                0);
+            dim3 block = detail::GetBlockSize2D();
             dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block);
             Kernels::Device::normalize<ScaleStddev>
                 <<<grid, block, 0, stream>>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, epsilon);
diff --git a/src/op_remap.cpp b/src/op_remap.cpp
index c7bde25b..1fdfced6 100644
--- a/src/op_remap.cpp
+++ b/src/op_remap.cpp
@@ -47,8 +47,7 @@ void dispatch_remap_mapInterp(hipStream_t stream, const Tensor &input, const Ten
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                Kernels::Device::remap<ImageWrapper<T>, ImageWrapper<T>, InterpolationWrapper<float2, B, M>>, 0);
+            dim3 block = detail::GetBlockSize2D();
             dim3 grid =
                 detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::remap<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, wrappedMapTensor);
diff --git a/src/op_resize.cpp b/src/op_resize.cpp
index bc74c8a3..a8df3b6d 100644
--- a/src/op_resize.cpp
+++ b/src/op_resize.cpp
@@ -45,8 +45,7 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block =
-                detail::GetMaximumPotentialBlockSize2D(Kernels::Device::resize<ImageWrapper<T>, ImageWrapper<T>>, 0);
+            dim3 block = detail::GetBlockSize2D();
             dim3 grid =
                 detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::resize<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, scaleX, scaleY);
diff --git a/src/op_rotate.cpp b/src/op_rotate.cpp
index a9f2bbb0..c922fa6f 100644
--- a/src/op_rotate.cpp
+++ b/src/op_rotate.cpp
@@ -60,8 +60,7 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                Kernels::Device::rotate<ImageWrapper<T>, ImageWrapper<T>, ArrayWrapper<double, 6>>, 0);
+            dim3 block = detail::GetBlockSize2D();
             dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block);
             Kernels::Device::rotate<<<grid, block, 0, stream>>>(inputWrap, outputWrap, matWrap);
             break;
diff --git a/src/op_thresholding.cpp b/src/op_thresholding.cpp
index 72ea45d9..5fc96005 100644
--- a/src/op_thresholding.cpp
+++ b/src/op_thresholding.cpp
@@ -51,8 +51,7 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten
     if (device == eDeviceType::GPU) {
         switch (m_threshType) {
             case THRESH_BINARY: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::binary_generic<ImageWrapper<T>, ImageWrapper<T>>, 0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block);
                 Kernels::Device::binary_generic<<<grid, block, 0, stream>>>(
                     inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh),
@@ -60,8 +59,7 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten
                 break;
             }
             case THRESH_BINARY_INV: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::binary_inv_generic<ImageWrapper<T>, ImageWrapper<T>>, 0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block);
                 Kernels::Device::binary_inv_generic<<<grid, block, 0, stream>>>(
                     inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh),
@@ -69,24 +67,21 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten
                 break;
             }
             case THRESH_TRUNC: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::trunc_generic<ImageWrapper<T>, ImageWrapper<T>>, 0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block);
                 Kernels::Device::trunc_generic<<<grid, block, 0, stream>>>(
                     inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh), m_maxBatchSize);
                 break;
             }
             case THRESH_TOZERO: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::tozero_generic<ImageWrapper<T>, ImageWrapper<T>>, 0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block);
                 Kernels::Device::tozero_generic<<<grid, block, 0, stream>>>(
                     inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh), m_maxBatchSize);
                 break;
             }
             case THRESH_TOZERO_INV: {
-                dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                    Kernels::Device::tozeroinv_generic<ImageWrapper<T>, ImageWrapper<T>>, 0);
+                dim3 block = detail::GetBlockSize2D();
                 dim3 grid = detail::GetGridSize2D(width, height, m_maxBatchSize, block);
                 Kernels::Device::tozeroinv_generic<<<grid, block, 0, stream>>>(
                     inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh), m_maxBatchSize);
diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp
index 5857e2b6..430d2130 100644
--- a/src/op_warp_perspective.cpp
+++ b/src/op_warp_perspective.cpp
@@ -43,8 +43,7 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetMaximumPotentialBlockSize2D(
-                Kernels::Device::warp_perspective<ImageWrapper<T>, ImageWrapper<T>, ArrayWrapper<float, 9>>, 0);
+            dim3 block = detail::GetBlockSize2D();
             dim3 grid =
                 detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::warp_perspective<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);

From a899bb0debca028310cf48e962e62a78157d3686 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 27 Apr 2026 11:39:34 -0400
Subject: [PATCH 10/17] Use 1D blocks for pointwise kernels

---
 include/core/detail/hip_utils.hpp | 55 +++++++++++++++++++++++++++++++
 src/op_bnd_box.cpp                |  4 +--
 src/op_composite.cpp              |  4 +--
 src/op_convert_to.cpp             |  4 +--
 src/op_copy_make_border.cpp       |  4 +--
 src/op_custom_crop.cpp            |  4 +--
 src/op_cvt_color.cpp              | 28 ++++++++--------
 src/op_flip.cpp                   |  4 +--
 src/op_gamma_contrast.cpp         |  4 +--
 src/op_normalize.cpp              |  4 +--
 src/op_thresholding.cpp           |  4 +--
 11 files changed, 87 insertions(+), 32 deletions(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index be59eba6..668b3323 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -78,4 +78,59 @@ static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize,
     return dim3((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, batchSize);
 }
 
+/**
+ * @brief Get the block size for a 1D kernel — all threads on the x axis.
+ *
+ * Use for pointwise kernels (no neighborhood reads). There is no locality
+ * benefit to grouping threads from different rows in the same block when
+ * each thread only touches its own pixel; a 1D block keeps every wavefront
+ * on a single contiguous row, maximizing coalescing and eliminating the
+ * y-axis index math and bottom-edge tail-wave waste of a 2D launch.
+ *
+ * Pair with GetGridSize1D, which lays the rows of the image out along
+ * gridDim.y so existing kernels can derive y directly from blockIdx.y
+ * without any indexing changes (since blockDim.y == 1 collapses the
+ * standard `y = blockDim.y * blockIdx.y + threadIdx.y` to `y = blockIdx.y`).
+ *
+ * The default of 256 threads is intentionally smaller than the 2D default
+ * of 512: pointwise ops are bandwidth-bound, so 4 wavefronts per block on
+ * AMD (warp=64) is enough to keep memory in flight on every CU while
+ * producing less tail waste than 512 on narrow images.
+ *
+ * @param[in] targetBlockSize Total threads per block. Should be a multiple
+ *                            of warpSize; otherwise it is silently floored
+ *                            to the nearest multiple. Defaults to 256.
+ * @return The block size: dim3(targetBlockSize, 1, 1), aligned to warpSize.
+ */
+static inline dim3 GetBlockSize1D(int targetBlockSize = 256) {
+    int deviceId;
+    int warpSize;
+
+    HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId));
+    HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId));
+
+    return dim3((targetBlockSize / warpSize) * warpSize, 1, 1);
+}
+
+/**
+ * @brief Get the grid size for a 1D-row-major launch.
+ *
+ * Lays one image row along gridDim.y and the batch along gridDim.z. The
+ * block from GetBlockSize1D has blockDim.y == 1, so the kernel's standard
+ * `y = blockDim.y * blockIdx.y + threadIdx.y` collapses to `y = blockIdx.y`
+ * with no kernel changes required.
+ *
+ * Caller is responsible for ensuring height does not exceed the device's
+ * gridDim.y limit (typically 65535) and batchSize does not exceed gridDim.z.
+ *
+ * @param[in] width      The width of the image.
+ * @param[in] height     The height of the image (becomes gridDim.y).
+ * @param[in] batchSize  The number of images (becomes gridDim.z).
+ * @param[in] blockSize  Block size from GetBlockSize1D.
+ * @return The grid size: dim3(ceil(width / blockSize.x), height, batchSize).
+ */
+static inline dim3 GetGridSize1D(size_t width, size_t height, size_t batchSize, dim3 blockSize) {
+    return dim3((width + blockSize.x - 1) / blockSize.x, height, batchSize);
+}
+
 }  // namespace roccv::detail
\ No newline at end of file
diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp
index a46c35f7..42da0eac 100644
--- a/src/op_bnd_box.cpp
+++ b/src/op_bnd_box.cpp
@@ -50,8 +50,8 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
     auto batchSize = inputWrapper.batches();
     switch (device) {
         case eDeviceType::GPU: {
-            const dim3 block = detail::GetBlockSize2D();
-            const dim3 grid = detail::GetGridSize2D(width, height, batchSize, block);
+            const dim3 block = detail::GetBlockSize1D();
+            const dim3 grid = detail::GetGridSize1D(width, height, batchSize, block);
 
             Rect_t *rects_ptr = nullptr;
             const auto n_rects = rects->size();
diff --git a/src/op_composite.cpp b/src/op_composite.cpp
index 091c54f0..559582fb 100644
--- a/src/op_composite.cpp
+++ b/src/op_composite.cpp
@@ -41,9 +41,9 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize2D();
+            dim3 block = detail::GetBlockSize1D();
             dim3 grid =
-                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+                detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::composite<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
             break;
         }
diff --git a/src/op_convert_to.cpp b/src/op_convert_to.cpp
index 27900fd6..11067706 100644
--- a/src/op_convert_to.cpp
+++ b/src/op_convert_to.cpp
@@ -55,9 +55,9 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize2D();
+            dim3 block = detail::GetBlockSize1D();
             dim3 grid =
-                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+                detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::convert_to<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
             break;
         }
diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp
index 36a29e91..7d8c2794 100644
--- a/src/op_copy_make_border.cpp
+++ b/src/op_copy_make_border.cpp
@@ -44,8 +44,8 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize2D();
-            dim3 grid = detail::GetGridSize2D(out_desc.width(), out_desc.height(), out_desc.batches(), block);
+            dim3 block = detail::GetBlockSize1D();
+            dim3 grid = detail::GetGridSize1D(out_desc.width(), out_desc.height(), out_desc.batches(), block);
             Kernels::Device::copy_make_border<<<grid, block, 0, stream>>>(in_desc, out_desc, top, left);
             break;
         }
diff --git a/src/op_custom_crop.cpp b/src/op_custom_crop.cpp
index e4ae6480..0e54db1a 100644
--- a/src/op_custom_crop.cpp
+++ b/src/op_custom_crop.cpp
@@ -41,9 +41,9 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize2D();
+            dim3 block = detail::GetBlockSize1D();
             dim3 grid =
-                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+                detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::custom_crop<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
             break;
         }
diff --git a/src/op_cvt_color.cpp b/src/op_cvt_color.cpp
index 9094854c..98c0826b 100644
--- a/src/op_cvt_color.cpp
+++ b/src/op_cvt_color.cpp
@@ -82,16 +82,16 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
 
         switch (conversionCode) {
             case eColorConversionCode::COLOR_BGR2GRAY: {
-                dim3 block = detail::GetBlockSize2D();
-                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                dim3 block = detail::GetBlockSize1D();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
                 Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::ZYXW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
                 break;
             }
 
             case eColorConversionCode::COLOR_RGB2GRAY: {
-                dim3 block = detail::GetBlockSize2D();
-                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                dim3 block = detail::GetBlockSize1D();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
                 Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::XYZW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
                 break;
@@ -99,40 +99,40 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
 
             case eColorConversionCode::COLOR_BGR2RGB:
             case eColorConversionCode::COLOR_RGB2BGR: {
-                dim3 block = detail::GetBlockSize2D();
-                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                dim3 block = detail::GetBlockSize1D();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
                 Kernels::Device::reorder<uchar3, eSwizzle::ZYXW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output));
                 break;
             }
 
             case eColorConversionCode::COLOR_BGR2YUV: {
-                dim3 block = detail::GetBlockSize2D();
-                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                dim3 block = detail::GetBlockSize1D();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
                 Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::ZYXW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
             }
 
             case eColorConversionCode::COLOR_RGB2YUV: {
-                dim3 block = detail::GetBlockSize2D();
-                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                dim3 block = detail::GetBlockSize1D();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
                 Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::XYZW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
             }
 
             case eColorConversionCode::COLOR_YUV2BGR: {
-                dim3 block = detail::GetBlockSize2D();
-                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                dim3 block = detail::GetBlockSize1D();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
                 Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::ZYXW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
             }
 
             case eColorConversionCode::COLOR_YUV2RGB: {
-                dim3 block = detail::GetBlockSize2D();
-                dim3 grid = detail::GetGridSize2D(width, height, samples, block);
+                dim3 block = detail::GetBlockSize1D();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
                 Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::XYZW>
                     <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
diff --git a/src/op_flip.cpp b/src/op_flip.cpp
index 79ec12b6..6bb7dd33 100644
--- a/src/op_flip.cpp
+++ b/src/op_flip.cpp
@@ -43,9 +43,9 @@ void dispatch_flip_axis(hipStream_t stream, const Tensor& input, const Tensor& o
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize2D();
+            dim3 block = detail::GetBlockSize1D();
             dim3 grid =
-                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+                detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
             Kernels::Device::flip<FlipType><<<grid, block, 0, stream>>>(inputWrapper, outputWrapper);
             break;
         }
diff --git a/src/op_gamma_contrast.cpp b/src/op_gamma_contrast.cpp
index f596e5bf..43e66ed8 100644
--- a/src/op_gamma_contrast.cpp
+++ b/src/op_gamma_contrast.cpp
@@ -43,9 +43,9 @@ void dispatch_gamma_contrast_dtype(hipStream_t stream, const Tensor &input, cons
     ImageWrapper<T> outputWrapper(output);
 
     if (device == eDeviceType::GPU) {
-        dim3 block = detail::GetBlockSize2D();
+        dim3 block = detail::GetBlockSize1D();
         dim3 grid =
-            detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+            detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
 
         Kernels::Device::gamma_contrast<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, gamma);
     } else if (device == eDeviceType::CPU) {
diff --git a/src/op_normalize.cpp b/src/op_normalize.cpp
index 5aeb1a8a..35dd5b25 100644
--- a/src/op_normalize.cpp
+++ b/src/op_normalize.cpp
@@ -50,8 +50,8 @@ void dispatch_normalize_stddev(hipStream_t stream, const Tensor& input, const Te
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize2D();
-            dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block);
+            dim3 block = detail::GetBlockSize1D();
+            dim3 grid = detail::GetGridSize1D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block);
             Kernels::Device::normalize<ScaleStddev>
                 <<<grid, block, 0, stream>>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, epsilon);
             break;
diff --git a/src/op_thresholding.cpp b/src/op_thresholding.cpp
index 837e9982..c78c9884 100644
--- a/src/op_thresholding.cpp
+++ b/src/op_thresholding.cpp
@@ -51,8 +51,8 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten
     const auto width = input.shape()[input.shape().layout().width_index()];
 
     if (device == eDeviceType::GPU) {
-        dim3 block = detail::GetBlockSize2D();
-        dim3 grid = detail::GetGridSize2D(width, height, outputWrapper.batches(), block);
+        dim3 block = detail::GetBlockSize1D();
+        dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
 
         switch (m_threshType) {
             case THRESH_BINARY:

From 063f97f703755bbe9cc1ecf97839c4959c43ec29 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 27 Apr 2026 11:49:55 -0400
Subject: [PATCH 11/17] Change default 2D block size to 256

---
 include/core/detail/hip_utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 668b3323..7acc635c 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -55,7 +55,7 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) {
  * @param[in] targetBlockSize The target block size.
  * @return The block size.
  */
-static inline dim3 GetBlockSize2D(int targetBlockSize = 512) {
+static inline dim3 GetBlockSize2D(int targetBlockSize = 256) {
     int deviceId;
     int warpSize;
 

From 9ba59d468583e882b8eb0a73dd2c91e2f988c890 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 27 Apr 2026 11:59:12 -0400
Subject: [PATCH 12/17] Switch default target block size to 128

---
 include/core/detail/hip_utils.hpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 7acc635c..522c7bde 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -55,7 +55,7 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) {
  * @param[in] targetBlockSize The target block size.
  * @return The block size.
  */
-static inline dim3 GetBlockSize2D(int targetBlockSize = 256) {
+static inline dim3 GetBlockSize2D(int targetBlockSize = 128) {
     int deviceId;
     int warpSize;
 
@@ -92,17 +92,12 @@ static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize,
  * without any indexing changes (since blockDim.y == 1 collapses the
  * standard `y = blockDim.y * blockIdx.y + threadIdx.y` to `y = blockIdx.y`).
  *
- * The default of 256 threads is intentionally smaller than the 2D default
- * of 512: pointwise ops are bandwidth-bound, so 4 wavefronts per block on
- * AMD (warp=64) is enough to keep memory in flight on every CU while
- * producing less tail waste than 512 on narrow images.
- *
  * @param[in] targetBlockSize Total threads per block. Should be a multiple
  *                            of warpSize; otherwise it is silently floored
  *                            to the nearest multiple. Defaults to 256.
  * @return The block size: dim3(targetBlockSize, 1, 1), aligned to warpSize.
  */
-static inline dim3 GetBlockSize1D(int targetBlockSize = 256) {
+static inline dim3 GetBlockSize1D(int targetBlockSize = 128) {
     int deviceId;
     int warpSize;
 

From 981346addd26255e25ef5ac06238e9390429b348 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 27 Apr 2026 13:47:55 -0400
Subject: [PATCH 13/17] Use runtime API to determine maximum occupancy

---
 include/core/detail/hip_utils.hpp | 120 +++++++++++++++++++++---------
 src/op_bilateral_filter.cpp       |   6 +-
 src/op_bnd_box.cpp                |   7 +-
 src/op_composite.cpp              |   6 +-
 src/op_convert_to.cpp             |   6 +-
 src/op_copy_make_border.cpp       |   6 +-
 src/op_custom_crop.cpp            |   5 +-
 src/op_cvt_color.cpp              |  49 ++++++------
 src/op_flip.cpp                   |   5 +-
 src/op_gamma_contrast.cpp         |   5 +-
 src/op_normalize.cpp              |   9 ++-
 src/op_remap.cpp                  |   7 +-
 src/op_resize.cpp                 |   6 +-
 src/op_rotate.cpp                 |   7 +-
 src/op_thresholding.cpp           |  52 ++++++++-----
 src/op_warp_perspective.cpp       |   6 +-
 16 files changed, 194 insertions(+), 108 deletions(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 522c7bde..74a17108 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -50,19 +50,94 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) {
 }
 
 /**
- * @brief Get the block size for a 2D kernel.
+ * @brief Get the device's wavefront/warp size, cached per-thread.
  *
- * @param[in] targetBlockSize The target block size.
- * @return The block size.
+ * Re-queries the runtime if the active HIP device has changed since the
+ * last call from the calling thread (so multi-device callers stay correct
+ * without paying for a query on every launch).
  */
-static inline dim3 GetBlockSize2D(int targetBlockSize = 128) {
+inline int CachedWarpSize() {
+    static thread_local int cachedDeviceId = -1;
+    static thread_local int cachedWarpSize = 0;
     int deviceId;
-    int warpSize;
+    HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId));
+    if (deviceId != cachedDeviceId) {
+        HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&cachedWarpSize, hipDeviceAttributeWarpSize, deviceId));
+        cachedDeviceId = deviceId;
+    }
+    return cachedWarpSize;
+}
 
+/**
+ * @brief Cache hipOccupancyMaxPotentialBlockSize per (kernel, device).
+ *
+ * The driver picks a thread count that maximizes resident wavefronts per CU
+ * for the given kernel on the current device, accounting for the kernel's
+ * register and static-shared-memory usage. The result is bounded above by
+ * Cap so the API's drive toward maximum occupancy can't override workload-
+ * class judgment (memory-bound ops gain nothing past ~50% occupancy and
+ * can lose throughput to cache pressure with overly large blocks).
+ *
+ * Each (Kernel, Cap) instantiation gets its own thread-local cache slot,
+ * so the runtime query runs once per (kernel, device, cap) per thread.
+ *
+ * @tparam Kernel The __global__ function pointer (auto NTTP — each unique
+ *                kernel address gets its own cached result).
+ * @tparam Cap    Upper bound on the returned block size.
+ */
+template <auto Kernel, int Cap>
+inline int CachedOccupancyBlockSize() {
+    static thread_local int cachedDeviceId = -1;
+    static thread_local int cachedBlockSize = 0;
+    int deviceId;
     HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId));
-    HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId));
+    if (deviceId != cachedDeviceId) {
+        int minGridSize;
+        HIP_VALIDATE_NO_ERRORS(hipOccupancyMaxPotentialBlockSize(&minGridSize, &cachedBlockSize, Kernel, 0, Cap));
+        cachedDeviceId = deviceId;
+    }
+    return cachedBlockSize;
+}
+
+/**
+ * @brief Pick a 1D block size for a pointwise kernel via runtime occupancy
+ *        query, capped at Cap, and cached per (kernel, device).
+ *
+ * Use for pointwise kernels (no neighborhood reads). The driver returns a
+ * thread count tuned to this specific kernel's register pressure on the
+ * current device — important on architectures with very different SIMD-per-CU
+ * counts and register-file sizes (e.g. CDNA wants more wavefronts in flight
+ * per CU than RDNA to hide HBM latency).
+ *
+ * Pair with GetGridSize1D — see its docs for the row-major launch shape.
+ *
+ * @tparam Kernel The __global__ function pointer.
+ * @tparam Cap    Upper bound on threads per block.
+ * @return dim3(blockSize, 1, 1).
+ */
+template <auto Kernel, int Cap = 256>
+inline dim3 GetBlockSize1D() {
+    return dim3(CachedOccupancyBlockSize<Kernel, Cap>(), 1, 1);
+}
 
-    return dim3(warpSize, targetBlockSize / warpSize, 1);
+/**
+ * @brief Pick a 2D block size for a stencil/transform kernel via runtime
+ *        occupancy query, capped at Cap, and cached per (kernel, device).
+ *
+ * Use for kernels with 2D locality (stencils, interpolation neighborhoods,
+ * affine warps). Reshapes the queried thread count as
+ * (warpSize, blockSize / warpSize, 1) so threadIdx.x is wavefront-aligned
+ * for coalescing while threadIdx.y stacks rows for tile-style cache reuse.
+ *
+ * @tparam Kernel The __global__ function pointer.
+ * @tparam Cap    Upper bound on threads per block.
+ * @return dim3(warpSize, blockSize / warpSize, 1).
+ */
+template <auto Kernel, int Cap = 512>
+inline dim3 GetBlockSize2D() {
+    int blockSize = CachedOccupancyBlockSize<Kernel, Cap>();
+    int warpSize = CachedWarpSize();
+    return dim3(warpSize, blockSize / warpSize, 1);
 }
 
 /**
@@ -78,35 +153,6 @@ static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize,
     return dim3((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, batchSize);
 }
 
-/**
- * @brief Get the block size for a 1D kernel — all threads on the x axis.
- *
- * Use for pointwise kernels (no neighborhood reads). There is no locality
- * benefit to grouping threads from different rows in the same block when
- * each thread only touches its own pixel; a 1D block keeps every wavefront
- * on a single contiguous row, maximizing coalescing and eliminating the
- * y-axis index math and bottom-edge tail-wave waste of a 2D launch.
- *
- * Pair with GetGridSize1D, which lays the rows of the image out along
- * gridDim.y so existing kernels can derive y directly from blockIdx.y
- * without any indexing changes (since blockDim.y == 1 collapses the
- * standard `y = blockDim.y * blockIdx.y + threadIdx.y` to `y = blockIdx.y`).
- *
- * @param[in] targetBlockSize Total threads per block. Should be a multiple
- *                            of warpSize; otherwise it is silently floored
- *                            to the nearest multiple. Defaults to 256.
- * @return The block size: dim3(targetBlockSize, 1, 1), aligned to warpSize.
- */
-static inline dim3 GetBlockSize1D(int targetBlockSize = 128) {
-    int deviceId;
-    int warpSize;
-
-    HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId));
-    HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId));
-
-    return dim3((targetBlockSize / warpSize) * warpSize, 1, 1);
-}
-
 /**
  * @brief Get the grid size for a 1D-row-major launch.
  *
@@ -128,4 +174,4 @@ static inline dim3 GetGridSize1D(size_t width, size_t height, size_t batchSize,
     return dim3((width + blockSize.x - 1) / blockSize.x, height, batchSize);
 }
 
-}  // namespace roccv::detail
\ No newline at end of file
+}  // namespace roccv::detail
diff --git a/src/op_bilateral_filter.cpp b/src/op_bilateral_filter.cpp
index 73c3fc66..b6eb6511 100644
--- a/src/op_bilateral_filter.cpp
+++ b/src/op_bilateral_filter.cpp
@@ -68,14 +68,14 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp
     float colorCoeff = -1 / (2 * sigmaColor * sigmaColor);
 
     if (device == eDeviceType::GPU) {
-        dim3 block = detail::GetBlockSize2D();
+        constexpr auto kernel = Kernels::Device::bilateral_filter<T, BorderWrapper<T, B>, ImageWrapper<T>>;
+        dim3 block = detail::GetBlockSize2D<kernel>();
         uint32_t xGridSize = (outputWrapper.width() + (block.x * 2) - 1) / (block.x * 2);
         uint32_t yGridSize = (outputWrapper.height() + (block.y * 2) - 1) / (block.y * 2);
         uint32_t zGridSize = outputWrapper.batches();
         dim3 grid(xGridSize, yGridSize, zGridSize);
 
-        Kernels::Device::bilateral_filter<T>
-            <<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff);
+        kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff);
     } else if (device == eDeviceType::CPU) {
         int divisor = std::gcd(4, outputWrapper.height());  // greatest common divisor
         int dividend = std::gcd((numThreads / divisor), outputWrapper.width());
diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp
index 42da0eac..3c0d5430 100644
--- a/src/op_bnd_box.cpp
+++ b/src/op_bnd_box.cpp
@@ -50,7 +50,8 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
     auto batchSize = inputWrapper.batches();
     switch (device) {
         case eDeviceType::GPU: {
-            const dim3 block = detail::GetBlockSize1D();
+            constexpr auto kernel = Kernels::Device::bndbox_kernel<has_alpha, T, ImageWrapper<T>, ImageWrapper<T>>;
+            const dim3 block = detail::GetBlockSize1D<kernel>();
             const dim3 grid = detail::GetGridSize1D(width, height, batchSize, block);
 
             Rect_t *rects_ptr = nullptr;
@@ -61,8 +62,8 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
                 HIP_VALIDATE_NO_ERRORS(
                     hipMemcpyAsync(rects_ptr, rects->data(), sizeof(Rect_t) * n_rects, hipMemcpyHostToDevice, stream));
             }
-            Kernels::Device::bndbox_kernel<has_alpha, T>
-                <<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height, width);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height,
+                                                width);
             if (n_rects > 0) {
                 HIP_VALIDATE_NO_ERRORS(hipFreeAsync(rects_ptr, stream));
             }
diff --git a/src/op_composite.cpp b/src/op_composite.cpp
index 559582fb..3de572cf 100644
--- a/src/op_composite.cpp
+++ b/src/op_composite.cpp
@@ -41,10 +41,12 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize1D();
+            constexpr auto kernel = Kernels::Device::composite<ImageWrapper<SrcType>, ImageWrapper<MaskType>,
+                                                                ImageWrapper<DstType>>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
             dim3 grid =
                 detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
-            Kernels::Device::composite<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
+            kernel<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
             break;
         }
 
diff --git a/src/op_convert_to.cpp b/src/op_convert_to.cpp
index 11067706..936f315d 100644
--- a/src/op_convert_to.cpp
+++ b/src/op_convert_to.cpp
@@ -55,10 +55,12 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize1D();
+            constexpr auto kernel =
+                Kernels::Device::convert_to<ImageWrapper<SRC_DT_NC>, ImageWrapper<DST_DT_NC>, DT_AB>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
             dim3 grid =
                 detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
-            Kernels::Device::convert_to<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
             break;
         }
         case eDeviceType::CPU: {
diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp
index 7d8c2794..8a16f864 100644
--- a/src/op_copy_make_border.cpp
+++ b/src/op_copy_make_border.cpp
@@ -44,9 +44,11 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize1D();
+            constexpr auto kernel =
+                Kernels::Device::copy_make_border<BorderWrapper<T, BorderMode>, ImageWrapper<T>>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
             dim3 grid = detail::GetGridSize1D(out_desc.width(), out_desc.height(), out_desc.batches(), block);
-            Kernels::Device::copy_make_border<<<grid, block, 0, stream>>>(in_desc, out_desc, top, left);
+            kernel<<<grid, block, 0, stream>>>(in_desc, out_desc, top, left);
             break;
         }
         case eDeviceType::CPU: {
diff --git a/src/op_custom_crop.cpp b/src/op_custom_crop.cpp
index 0e54db1a..89916758 100644
--- a/src/op_custom_crop.cpp
+++ b/src/op_custom_crop.cpp
@@ -41,10 +41,11 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize1D();
+            constexpr auto kernel = Kernels::Device::custom_crop<ImageWrapper<T>, ImageWrapper<T>>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
             dim3 grid =
                 detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
-            Kernels::Device::custom_crop<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
             break;
         }
 
diff --git a/src/op_cvt_color.cpp b/src/op_cvt_color.cpp
index 98c0826b..8f978999 100644
--- a/src/op_cvt_color.cpp
+++ b/src/op_cvt_color.cpp
@@ -82,59 +82,66 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
 
         switch (conversionCode) {
             case eColorConversionCode::COLOR_BGR2GRAY: {
-                dim3 block = detail::GetBlockSize1D();
+                constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_grayscale<
+                    uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>, ImageWrapper<uchar1>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
                 dim3 grid = detail::GetGridSize1D(width, height, samples, block);
-                Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::ZYXW>
-                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
                 break;
             }
 
             case eColorConversionCode::COLOR_RGB2GRAY: {
-                dim3 block = detail::GetBlockSize1D();
+                constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_grayscale<
+                    uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>, ImageWrapper<uchar1>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
                 dim3 grid = detail::GetGridSize1D(width, height, samples, block);
-                Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::XYZW>
-                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
                 break;
             }
 
             case eColorConversionCode::COLOR_BGR2RGB:
             case eColorConversionCode::COLOR_RGB2BGR: {
-                dim3 block = detail::GetBlockSize1D();
+                constexpr auto kernel = Kernels::Device::reorder<
+                    uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
                 dim3 grid = detail::GetGridSize1D(width, height, samples, block);
-                Kernels::Device::reorder<uchar3, eSwizzle::ZYXW>
-                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output));
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output));
                 break;
             }
 
             case eColorConversionCode::COLOR_BGR2YUV: {
-                dim3 block = detail::GetBlockSize1D();
+                constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_yuv<
+                    uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
                 dim3 grid = detail::GetGridSize1D(width, height, samples, block);
-                Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::ZYXW>
-                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
             }
 
             case eColorConversionCode::COLOR_RGB2YUV: {
-                dim3 block = detail::GetBlockSize1D();
+                constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_yuv<
+                    uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
                 dim3 grid = detail::GetGridSize1D(width, height, samples, block);
-                Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::XYZW>
-                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
             }
 
             case eColorConversionCode::COLOR_YUV2BGR: {
-                dim3 block = detail::GetBlockSize1D();
+                constexpr auto kernel = Kernels::Device::yuv_to_rgb_or_bgr<
+                    uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
                 dim3 grid = detail::GetGridSize1D(width, height, samples, block);
-                Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::ZYXW>
-                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
             }
 
             case eColorConversionCode::COLOR_YUV2RGB: {
-                dim3 block = detail::GetBlockSize1D();
+                constexpr auto kernel = Kernels::Device::yuv_to_rgb_or_bgr<
+                    uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
                 dim3 grid = detail::GetGridSize1D(width, height, samples, block);
-                Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::XYZW>
-                    <<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
             }
 
diff --git a/src/op_flip.cpp b/src/op_flip.cpp
index 6bb7dd33..54886b6f 100644
--- a/src/op_flip.cpp
+++ b/src/op_flip.cpp
@@ -43,10 +43,11 @@ void dispatch_flip_axis(hipStream_t stream, const Tensor& input, const Tensor& o
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize1D();
+            constexpr auto kernel = Kernels::Device::flip<FlipType, ImageWrapper<T>, ImageWrapper<T>>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
             dim3 grid =
                 detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
-            Kernels::Device::flip<FlipType><<<grid, block, 0, stream>>>(inputWrapper, outputWrapper);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper);
             break;
         }
 
diff --git a/src/op_gamma_contrast.cpp b/src/op_gamma_contrast.cpp
index 43e66ed8..e764d27c 100644
--- a/src/op_gamma_contrast.cpp
+++ b/src/op_gamma_contrast.cpp
@@ -43,11 +43,12 @@ void dispatch_gamma_contrast_dtype(hipStream_t stream, const Tensor &input, cons
     ImageWrapper<T> outputWrapper(output);
 
     if (device == eDeviceType::GPU) {
-        dim3 block = detail::GetBlockSize1D();
+        constexpr auto kernel = Kernels::Device::gamma_contrast<ImageWrapper<T>, ImageWrapper<T>>;
+        dim3 block = detail::GetBlockSize1D<kernel>();
         dim3 grid =
             detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
 
-        Kernels::Device::gamma_contrast<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, gamma);
+        kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, gamma);
     } else if (device == eDeviceType::CPU) {
         Kernels::Host::gamma_contrast(inputWrapper, outputWrapper, gamma);
     }
diff --git a/src/op_normalize.cpp b/src/op_normalize.cpp
index 35dd5b25..666b317e 100644
--- a/src/op_normalize.cpp
+++ b/src/op_normalize.cpp
@@ -50,10 +50,13 @@ void dispatch_normalize_stddev(hipStream_t stream, const Tensor& input, const Te
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize1D();
+            constexpr auto kernel =
+                Kernels::Device::normalize<ScaleStddev, ImageWrapper<T>, ImageWrapper<T>, ImageWrapper<work_type>,
+                                            ImageWrapper<work_type>>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
             dim3 grid = detail::GetGridSize1D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block);
-            Kernels::Device::normalize<ScaleStddev>
-                <<<grid, block, 0, stream>>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, epsilon);
+            kernel<<<grid, block, 0, stream>>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift,
+                                                 epsilon);
             break;
         }
         case eDeviceType::CPU: {
diff --git a/src/op_remap.cpp b/src/op_remap.cpp
index 76ba7cae..cdd4209d 100644
--- a/src/op_remap.cpp
+++ b/src/op_remap.cpp
@@ -89,11 +89,12 @@ void dispatch_remap_mapInterp(hipStream_t stream, const Tensor &input, const Ten
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize2D();
+            constexpr auto kernel = Kernels::Device::remap<
+                InterpolationWrapper<T, B, I>, ImageWrapper<T>, InterpolationWrapper<float2, B, M>>;
+            dim3 block = detail::GetBlockSize2D<kernel>();
             dim3 grid =
                 detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
-            Kernels::Device::remap<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, wrappedMapTensor,
-                                                               mapBatchSize, params);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, wrappedMapTensor, mapBatchSize, params);
             break;
         }
 
diff --git a/src/op_resize.cpp b/src/op_resize.cpp
index f40373fa..30d9dc14 100644
--- a/src/op_resize.cpp
+++ b/src/op_resize.cpp
@@ -45,10 +45,12 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize2D();
+            constexpr auto kernel = Kernels::Device::resize<
+                InterpolationWrapper<T, eBorderType::BORDER_TYPE_REPLICATE, I>, ImageWrapper<T>>;
+            dim3 block = detail::GetBlockSize2D<kernel>();
             dim3 grid =
                 detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
-            Kernels::Device::resize<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, scaleX, scaleY);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, scaleX, scaleY);
             break;
         }
 
diff --git a/src/op_rotate.cpp b/src/op_rotate.cpp
index 9afec0ac..be7d73a4 100644
--- a/src/op_rotate.cpp
+++ b/src/op_rotate.cpp
@@ -60,9 +60,12 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize2D();
+            constexpr auto kernel = Kernels::Device::rotate<
+                InterpolationWrapper<T, eBorderType::BORDER_TYPE_CONSTANT, InterpType>, ImageWrapper<T>,
+                ArrayWrapper<double, 6>>;
+            dim3 block = detail::GetBlockSize2D<kernel>();
             dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block);
-            Kernels::Device::rotate<<<grid, block, 0, stream>>>(inputWrap, outputWrap, matWrap);
+            kernel<<<grid, block, 0, stream>>>(inputWrap, outputWrap, matWrap);
             break;
         }
 
diff --git a/src/op_thresholding.cpp b/src/op_thresholding.cpp
index c78c9884..2325c5eb 100644
--- a/src/op_thresholding.cpp
+++ b/src/op_thresholding.cpp
@@ -51,32 +51,44 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten
     const auto width = input.shape()[input.shape().layout().width_index()];
 
     if (device == eDeviceType::GPU) {
-        dim3 block = detail::GetBlockSize1D();
-        dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
-
         switch (m_threshType) {
-            case THRESH_BINARY:
-                Kernels::Device::binary_generic<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper,
-                                                                            GenericTensorWrapper<double>(thresh),
-                                                                            GenericTensorWrapper<double>(maxVal));
+            case THRESH_BINARY: {
+                constexpr auto kernel = Kernels::Device::binary_generic<ImageWrapper<T>, ImageWrapper<T>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
+                kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh),
+                                                   GenericTensorWrapper<double>(maxVal));
                 break;
-            case THRESH_BINARY_INV:
-                Kernels::Device::binary_inv_generic<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper,
-                                                                                GenericTensorWrapper<double>(thresh),
-                                                                                GenericTensorWrapper<double>(maxVal));
+            }
+            case THRESH_BINARY_INV: {
+                constexpr auto kernel = Kernels::Device::binary_inv_generic<ImageWrapper<T>, ImageWrapper<T>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
+                kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh),
+                                                   GenericTensorWrapper<double>(maxVal));
                 break;
-            case THRESH_TRUNC:
-                Kernels::Device::trunc_generic<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper,
-                                                                           GenericTensorWrapper<double>(thresh));
+            }
+            case THRESH_TRUNC: {
+                constexpr auto kernel = Kernels::Device::trunc_generic<ImageWrapper<T>, ImageWrapper<T>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
+                kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh));
                 break;
-            case THRESH_TOZERO:
-                Kernels::Device::tozero_generic<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper,
-                                                                            GenericTensorWrapper<double>(thresh));
+            }
+            case THRESH_TOZERO: {
+                constexpr auto kernel = Kernels::Device::tozero_generic<ImageWrapper<T>, ImageWrapper<T>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
+                kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh));
                 break;
-            case THRESH_TOZERO_INV:
-                Kernels::Device::tozeroinv_generic<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper,
-                                                                               GenericTensorWrapper<double>(thresh));
+            }
+            case THRESH_TOZERO_INV: {
+                constexpr auto kernel = Kernels::Device::tozeroinv_generic<ImageWrapper<T>, ImageWrapper<T>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
+                kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh));
                 break;
+            }
         }
     } else if (device == eDeviceType::CPU) {
         switch (m_threshType) {
diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp
index 9dd0d700..4db3f3f4 100644
--- a/src/op_warp_perspective.cpp
+++ b/src/op_warp_perspective.cpp
@@ -42,10 +42,12 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block = detail::GetBlockSize2D();
+            constexpr auto kernel = Kernels::Device::warp_perspective<
+                InterpolationWrapper<T, B, I>, ImageWrapper<T>, ArrayWrapper<float, 9>>;
+            dim3 block = detail::GetBlockSize2D<kernel>();
             dim3 grid =
                 detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
-            Kernels::Device::warp_perspective<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);
             break;
         }
 

From f5baca12a09712b793bb1f44bbdb8a9338b751ed Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 27 Apr 2026 14:08:01 -0400
Subject: [PATCH 14/17] Increase default caps

---
 include/core/detail/hip_utils.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 74a17108..57304e3a 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -115,7 +115,7 @@ inline int CachedOccupancyBlockSize() {
  * @tparam Cap    Upper bound on threads per block.
  * @return dim3(blockSize, 1, 1).
  */
-template <auto Kernel, int Cap = 256>
+template <auto Kernel, int Cap = 512>
 inline dim3 GetBlockSize1D() {
     return dim3(CachedOccupancyBlockSize<Kernel, Cap>(), 1, 1);
 }
@@ -133,7 +133,7 @@ inline dim3 GetBlockSize1D() {
  * @tparam Cap    Upper bound on threads per block.
  * @return dim3(warpSize, blockSize / warpSize, 1).
  */
-template <auto Kernel, int Cap = 512>
+template <auto Kernel, int Cap = 1024>
 inline dim3 GetBlockSize2D() {
     int blockSize = CachedOccupancyBlockSize<Kernel, Cap>();
     int warpSize = CachedWarpSize();

From 00a543dbf32bfd3c1b660c68e40696e454d4d18f Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 27 Apr 2026 14:12:53 -0400
Subject: [PATCH 15/17] Decrease 1D block size cap to 256

---
 include/core/detail/hip_utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 57304e3a..85e24d3e 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -115,7 +115,7 @@ inline int CachedOccupancyBlockSize() {
  * @tparam Cap    Upper bound on threads per block.
  * @return dim3(blockSize, 1, 1).
  */
-template <auto Kernel, int Cap = 512>
+template <auto Kernel, int Cap = 256>
 inline dim3 GetBlockSize1D() {
     return dim3(CachedOccupancyBlockSize<Kernel, Cap>(), 1, 1);
 }

From e6f55882c1d2218d0cc5d6d464283c4b9b79b09b Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 27 Apr 2026 14:13:18 -0400
Subject: [PATCH 16/17] Increase block size cap to 1024 for 1D blocks

---
 include/core/detail/hip_utils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 85e24d3e..036496d1 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -115,7 +115,7 @@ inline int CachedOccupancyBlockSize() {
  * @tparam Cap    Upper bound on threads per block.
  * @return dim3(blockSize, 1, 1).
  */
-template <auto Kernel, int Cap = 256>
+template <auto Kernel, int Cap = 1024>
 inline dim3 GetBlockSize1D() {
     return dim3(CachedOccupancyBlockSize<Kernel, Cap>(), 1, 1);
 }

From fc8c00871146d3d13705c63b6ca7b2c3f62f1002 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Mon, 27 Apr 2026 14:17:40 -0400
Subject: [PATCH 17/17] Return cap sizes back to normal defaults

---
 include/core/detail/hip_utils.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index 036496d1..74a17108 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -115,7 +115,7 @@ inline int CachedOccupancyBlockSize() {
  * @tparam Cap    Upper bound on threads per block.
  * @return dim3(blockSize, 1, 1).
  */
-template <auto Kernel, int Cap = 1024>
+template <auto Kernel, int Cap = 256>
 inline dim3 GetBlockSize1D() {
     return dim3(CachedOccupancyBlockSize<Kernel, Cap>(), 1, 1);
 }
@@ -133,7 +133,7 @@ inline dim3 GetBlockSize1D() {
  * @tparam Cap    Upper bound on threads per block.
  * @return dim3(warpSize, blockSize / warpSize, 1).
  */
-template <auto Kernel, int Cap = 1024>
+template <auto Kernel, int Cap = 512>
 inline dim3 GetBlockSize2D() {
     int blockSize = CachedOccupancyBlockSize<Kernel, Cap>();
     int warpSize = CachedWarpSize();