diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp
index d8a918f1..74a17108 100644
--- a/include/core/detail/hip_utils.hpp
+++ b/include/core/detail/hip_utils.hpp
@@ -25,8 +25,10 @@
 #include <functional>
 #include <memory>
 
+#include "core/hip_assert.h"
+
 namespace roccv::detail {
-static void StreamCallback(void* userData) {
+static inline void StreamCallback(void* userData) {
     std::function<void()>* func = static_cast<std::function<void()>*>(userData);
     (*func)();
     delete func;
@@ -46,4 +48,130 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) {
     HIP_VALIDATE_NO_ERRORS(hipLaunchHostFunc(stream, StreamCallback, data.get()));
     data.release();  // Release ownership, StreamCallback is responsible for it now
 }
-}  // namespace roccv::detail
\ No newline at end of file
+
+/**
+ * @brief Get the device's wavefront/warp size, cached per-thread.
+ *
+ * Re-queries the runtime if the active HIP device has changed since the
+ * last call from the calling thread (so multi-device callers stay correct
+ * without paying for a query on every launch).
+ */
+inline int CachedWarpSize() {
+    static thread_local int cachedDeviceId = -1;
+    static thread_local int cachedWarpSize = 0;
+    int deviceId;
+    HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId));
+    if (deviceId != cachedDeviceId) {
+        HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&cachedWarpSize, hipDeviceAttributeWarpSize, deviceId));
+        cachedDeviceId = deviceId;
+    }
+    return cachedWarpSize;
+}
+
+/**
+ * @brief Cache hipOccupancyMaxPotentialBlockSize per (kernel, device).
+ *
+ * The driver picks a thread count that maximizes resident wavefronts per CU
+ * for the given kernel on the current device, accounting for the kernel's
+ * register and static-shared-memory usage. The result is bounded above by
+ * Cap so the API's drive toward maximum occupancy can't override workload-
+ * class judgment (memory-bound ops gain nothing past ~50% occupancy and
+ * can lose throughput to cache pressure with overly large blocks).
+ *
+ * Each (Kernel, Cap) instantiation gets its own thread-local cache slot,
+ * so the runtime query runs once per (kernel, device, cap) per thread.
+ *
+ * @tparam Kernel The __global__ function pointer (auto NTTP — each unique
+ *                kernel address gets its own cached result).
+ * @tparam Cap    Upper bound on the returned block size.
+ */
+template <auto Kernel, int Cap>
+inline int CachedOccupancyBlockSize() {
+    static thread_local int cachedDeviceId = -1;
+    static thread_local int cachedBlockSize = 0;
+    int deviceId;
+    HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId));
+    if (deviceId != cachedDeviceId) {
+        int minGridSize;
+        HIP_VALIDATE_NO_ERRORS(hipOccupancyMaxPotentialBlockSize(&minGridSize, &cachedBlockSize, Kernel, 0, Cap));
+        cachedDeviceId = deviceId;
+    }
+    return cachedBlockSize;
+}
+
+/**
+ * @brief Pick a 1D block size for a pointwise kernel via runtime occupancy
+ *        query, capped at Cap, and cached per (kernel, device).
+ *
+ * Use for pointwise kernels (no neighborhood reads). The driver returns a
+ * thread count tuned to this specific kernel's register pressure on the
+ * current device — important on architectures with very different SIMD-per-CU
+ * counts and register-file sizes (e.g. CDNA wants more wavefronts in flight
+ * per CU than RDNA to hide HBM latency).
+ *
+ * Pair with GetGridSize1D — see its docs for the row-major launch shape.
+ *
+ * @tparam Kernel The __global__ function pointer.
+ * @tparam Cap    Upper bound on threads per block.
+ * @return dim3(blockSize, 1, 1).
+ */
+template <auto Kernel, int Cap = 256>
+inline dim3 GetBlockSize1D() {
+    return dim3(CachedOccupancyBlockSize<Kernel, Cap>(), 1, 1);
+}
+
+/**
+ * @brief Pick a 2D block size for a stencil/transform kernel via runtime
+ *        occupancy query, capped at Cap, and cached per (kernel, device).
+ *
+ * Use for kernels with 2D locality (stencils, interpolation neighborhoods,
+ * affine warps). Reshapes the queried thread count as
+ * (warpSize, blockSize / warpSize, 1) so threadIdx.x is wavefront-aligned
+ * for coalescing while threadIdx.y stacks rows for tile-style cache reuse.
+ *
+ * @tparam Kernel The __global__ function pointer.
+ * @tparam Cap    Upper bound on threads per block.
+ * @return dim3(warpSize, blockSize / warpSize, 1).
+ */
+template <auto Kernel, int Cap = 512>
+inline dim3 GetBlockSize2D() {
+    int blockSize = CachedOccupancyBlockSize<Kernel, Cap>();
+    int warpSize = CachedWarpSize();
+    return dim3(warpSize, blockSize / warpSize, 1);
+}
+
+/**
+ * @brief Get the grid size for a 2D kernel.
+ *
+ * @param[in] width The width of the image.
+ * @param[in] height The height of the image.
+ * @param[in] batchSize The batch size of the image.
+ * @param[in] blockSize The block size of the kernel.
+ * @return The grid size.
+ */
+static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize, dim3 blockSize) {
+    return dim3((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, batchSize);
+}
+
+/**
+ * @brief Get the grid size for a 1D-row-major launch.
+ *
+ * Lays one image row along gridDim.y and the batch along gridDim.z. The
+ * block from GetBlockSize1D has blockDim.y == 1, so the kernel's standard
+ * `y = blockDim.y * blockIdx.y + threadIdx.y` collapses to `y = blockIdx.y`
+ * with no kernel changes required.
+ *
+ * Caller is responsible for ensuring height does not exceed the device's
+ * gridDim.y limit (typically 65535) and batchSize does not exceed gridDim.z.
+ *
+ * @param[in] width      The width of the image.
+ * @param[in] height     The height of the image (becomes gridDim.y).
+ * @param[in] batchSize  The number of images (becomes gridDim.z).
+ * @param[in] blockSize  Block size from GetBlockSize1D.
+ * @return The grid size: dim3(ceil(width / blockSize.x), height, batchSize).
+ */
+static inline dim3 GetGridSize1D(size_t width, size_t height, size_t batchSize, dim3 blockSize) {
+    return dim3((width + blockSize.x - 1) / blockSize.x, height, batchSize);
+}
+
+}  // namespace roccv::detail
diff --git a/include/kernels/device/bnd_box_device.hpp b/include/kernels/device/bnd_box_device.hpp
index 9076e634..14900490 100644
--- a/include/kernels/device/bnd_box_device.hpp
+++ b/include/kernels/device/bnd_box_device.hpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 #include <core/detail/type_traits.hpp>
 
+#include "common/math_vector.hpp"
 #include "kernels/kernel_helpers.hpp"
 #include "operator_types.h"
 
diff --git a/include/kernels/host/bnd_box_host.hpp b/include/kernels/host/bnd_box_host.hpp
index db1af9f7..7b3826c1 100644
--- a/include/kernels/host/bnd_box_host.hpp
+++ b/include/kernels/host/bnd_box_host.hpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 #include <core/detail/type_traits.hpp>
 
+#include "common/math_vector.hpp"
 #include "kernels/kernel_helpers.hpp"
 #include "operator_types.h"
 
diff --git a/src/op_bilateral_filter.cpp b/src/op_bilateral_filter.cpp
index dffba8ae..b6eb6511 100644
--- a/src/op_bilateral_filter.cpp
+++ b/src/op_bilateral_filter.cpp
@@ -30,6 +30,7 @@ THE SOFTWARE.
 
 #include "common/validation_helpers.hpp"
 #include "core/detail/casting.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/border_wrapper.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/bilateral_filter_device.hpp"
@@ -61,21 +62,20 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp
         sigmaSpace = 1.0f;
     }
 
-    const int radius =
-        (diameter <= 0) ? static_cast<int>(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1);
+    const int radius = (diameter <= 0) ? static_cast<int>(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1);
 
     float spaceCoeff = -1 / (2 * sigmaSpace * sigmaSpace);
     float colorCoeff = -1 / (2 * sigmaColor * sigmaColor);
 
     if (device == eDeviceType::GPU) {
-        dim3 block(8, 8);
+        constexpr auto kernel = Kernels::Device::bilateral_filter<T, BorderWrapper<T, B>, ImageWrapper<T>>;
+        dim3 block = detail::GetBlockSize2D<kernel>();
         uint32_t xGridSize = (outputWrapper.width() + (block.x * 2) - 1) / (block.x * 2);
         uint32_t yGridSize = (outputWrapper.height() + (block.y * 2) - 1) / (block.y * 2);
         uint32_t zGridSize = outputWrapper.batches();
         dim3 grid(xGridSize, yGridSize, zGridSize);
 
-        Kernels::Device::bilateral_filter<T>
-            <<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff);
+        kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff);
     } else if (device == eDeviceType::CPU) {
         int divisor = std::gcd(4, outputWrapper.height());  // greatest common divisor
         int dividend = std::gcd((numThreads / divisor), outputWrapper.width());
diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp
index 3b2443e0..3c0d5430 100644
--- a/src/op_bnd_box.cpp
+++ b/src/op_bnd_box.cpp
@@ -25,10 +25,8 @@ THE SOFTWARE.
 
 #include <algorithm>
 #include <cstring>
-#include <iostream>
 #include <vector>
 
-#include "common/math_vector.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/detail/hip_utils.hpp"
 #include "core/tensor.hpp"
@@ -52,8 +50,9 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
     auto batchSize = inputWrapper.batches();
     switch (device) {
         case eDeviceType::GPU: {
-            const dim3 block(32, 32);
-            const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y, batchSize);
+            constexpr auto kernel = Kernels::Device::bndbox_kernel<has_alpha, T, ImageWrapper<T>, ImageWrapper<T>>;
+            const dim3 block = detail::GetBlockSize1D<kernel>();
+            const dim3 grid = detail::GetGridSize1D(width, height, batchSize, block);
 
             Rect_t *rects_ptr = nullptr;
             const auto n_rects = rects->size();
@@ -63,8 +62,8 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
                 HIP_VALIDATE_NO_ERRORS(
                     hipMemcpyAsync(rects_ptr, rects->data(), sizeof(Rect_t) * n_rects, hipMemcpyHostToDevice, stream));
             }
-            Kernels::Device::bndbox_kernel<has_alpha, T>
-                <<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height, width);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height,
+                                                width);
             if (n_rects > 0) {
                 HIP_VALIDATE_NO_ERRORS(hipFreeAsync(rects_ptr, stream));
             }
diff --git a/src/op_composite.cpp b/src/op_composite.cpp
index 650a7bf1..3de572cf 100644
--- a/src/op_composite.cpp
+++ b/src/op_composite.cpp
@@ -24,6 +24,7 @@
 #include <functional>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/composite_device.hpp"
 #include "kernels/host/composite_host.hpp"
@@ -40,10 +41,12 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
-            Kernels::Device::composite<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
+            constexpr auto kernel = Kernels::Device::composite<ImageWrapper<SrcType>, ImageWrapper<MaskType>,
+                                                                ImageWrapper<DstType>>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
+            dim3 grid =
+                detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+            kernel<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
             break;
         }
 
diff --git a/src/op_convert_to.cpp b/src/op_convert_to.cpp
index 3baa4577..936f315d 100644
--- a/src/op_convert_to.cpp
+++ b/src/op_convert_to.cpp
@@ -21,22 +21,23 @@ THE SOFTWARE.
 */
 #include "op_convert_to.hpp"
 
+#include <hip/hip_runtime.h>
+
 #include <functional>
 
-#include <hip/hip_runtime.h>
-#include "core/wrappers/image_wrapper.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/detail/casting.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/detail/type_traits.hpp"
+#include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/convert_to_device.hpp"
 #include "kernels/host/convert_to_host.hpp"
 
 namespace roccv {
 
 template <typename SRC_DT, typename DST_DT, int NC>
-void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output,
-                                       double alpha, double beta, eDeviceType device) {
-    
+void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha,
+                                  double beta, eDeviceType device) {
     using SRC_DT_NC = detail::MakeType<SRC_DT, NC>;
     using DST_DT_NC = detail::MakeType<DST_DT, NC>;
 
@@ -47,17 +48,19 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
     using DST_BT = detail::BaseType<DST_DT>;
 
     using DT_AB = decltype(float() * SRC_BT() * DST_BT());
-    
+
     DT_AB alpha_ab = detail::SaturateCast<DT_AB>(alpha);
     DT_AB beta_ab = detail::SaturateCast<DT_AB>(beta);
 
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
-            Kernels::Device::convert_to<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
+            constexpr auto kernel =
+                Kernels::Device::convert_to<ImageWrapper<SRC_DT_NC>, ImageWrapper<DST_DT_NC>, DT_AB>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
+            dim3 grid =
+                detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
             break;
         }
         case eDeviceType::CPU: {
@@ -68,16 +71,14 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
 }
 
 template <typename SRC_DT, typename DST_DT>
-void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
-                                       double alpha, double beta, eDeviceType device) {
-
+void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha,
+                                      double beta, eDeviceType device) {
     int64_t channels = output.shape(output.layout().channels_index());
     // Select kernel dispatcher based on number of channels.
     // clang-format off
     static const std::array<std::function<void(hipStream_t, const Tensor &, const Tensor &, double, double, eDeviceType)>, 4>
         funcs = {dispatch_convert_to_channels<SRC_DT, DST_DT, 1>, dispatch_convert_to_channels<SRC_DT, DST_DT, 2>, dispatch_convert_to_channels<SRC_DT, DST_DT, 3>, dispatch_convert_to_channels<SRC_DT, DST_DT, 4>};
-        
-            
+
     // clang-format on
 
     auto func = funcs.at(channels - 1);
@@ -86,11 +87,10 @@ void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, c
 }
 
 template <typename SRC_DT>
-void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
-                                       double alpha, double beta, eDeviceType device) {
-    
+void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha,
+                                     double beta, eDeviceType device) {
     eDataType output_dtype = output.dtype().etype();
-    
+
     // Select kernel dispatcher based on a base input datatype.
     // clang-format off
     static const std::unordered_map<eDataType, std::function<void(hipStream_t, const Tensor &, const Tensor &, double, double, eDeviceType)>>
diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp
index feacfbd9..8a16f864 100644
--- a/src/op_copy_make_border.cpp
+++ b/src/op_copy_make_border.cpp
@@ -24,9 +24,10 @@
 #include <functional>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/casting.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/border_wrapper.hpp"
 #include "core/wrappers/image_wrapper.hpp"
-#include "core/wrappers/interpolation_wrapper.hpp"
 #include "kernels/device/copy_make_border_device.hpp"
 #include "kernels/host/copy_make_border_host.hpp"
 
@@ -43,10 +44,11 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block_dim(64, 16);
-            dim3 grid_dim((out_desc.width() + block_dim.x - 1) / block_dim.x,
-                          (out_desc.height() + block_dim.y - 1) / block_dim.y, out_desc.batches());
-            Kernels::Device::copy_make_border<<<grid_dim, block_dim, 0, stream>>>(in_desc, out_desc, top, left);
+            constexpr auto kernel =
+                Kernels::Device::copy_make_border<BorderWrapper<T, BorderMode>, ImageWrapper<T>>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
+            dim3 grid = detail::GetGridSize1D(out_desc.width(), out_desc.height(), out_desc.batches(), block);
+            kernel<<<grid, block, 0, stream>>>(in_desc, out_desc, top, left);
             break;
         }
         case eDeviceType::CPU: {
diff --git a/src/op_custom_crop.cpp b/src/op_custom_crop.cpp
index 3d1a7056..89916758 100644
--- a/src/op_custom_crop.cpp
+++ b/src/op_custom_crop.cpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include <functional>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/custom_crop_device.hpp"
 #include "kernels/host/custom_crop_host.hpp"
@@ -40,10 +41,11 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
-            Kernels::Device::custom_crop<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
+            constexpr auto kernel = Kernels::Device::custom_crop<ImageWrapper<T>, ImageWrapper<T>>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
+            dim3 grid =
+                detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
             break;
         }
 
diff --git a/src/op_cvt_color.cpp b/src/op_cvt_color.cpp
index 31abd103..8f978999 100644
--- a/src/op_cvt_color.cpp
+++ b/src/op_cvt_color.cpp
@@ -23,9 +23,8 @@ THE SOFTWARE.
 
 #include <hip/hip_runtime.h>
 
-#include <iostream>
-
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/tensor.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/cvt_color_device.hpp"
@@ -81,45 +80,70 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu
     if (device == eDeviceType::GPU) {
         // Dispatch appropriate device kernel based on given conversion code
 
-        dim3 blockSize(32, 16);
-        dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, samples);
-
         switch (conversionCode) {
-            case eColorConversionCode::COLOR_BGR2GRAY:
-                Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::ZYXW>
-                    <<<gridSize, blockSize, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
+            case eColorConversionCode::COLOR_BGR2GRAY: {
+                constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_grayscale<
+                    uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>, ImageWrapper<uchar1>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
                 break;
-
-            case eColorConversionCode::COLOR_RGB2GRAY:
-                Kernels::Device::rgb_or_bgr_to_grayscale<uchar3, eSwizzle::XYZW>
-                    <<<gridSize, blockSize, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
+            }
+
+            case eColorConversionCode::COLOR_RGB2GRAY: {
+                constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_grayscale<
+                    uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>, ImageWrapper<uchar1>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar1>(output));
                 break;
+            }
 
             case eColorConversionCode::COLOR_BGR2RGB:
-            case eColorConversionCode::COLOR_RGB2BGR:
-                Kernels::Device::reorder<uchar3, eSwizzle::ZYXW>
-                    <<<gridSize, blockSize, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output));
+            case eColorConversionCode::COLOR_RGB2BGR: {
+                constexpr auto kernel = Kernels::Device::reorder<
+                    uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output));
                 break;
-
-            case eColorConversionCode::COLOR_BGR2YUV:
-                Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::ZYXW><<<gridSize, blockSize, 0, stream>>>(
-                    ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+            }
+
+            case eColorConversionCode::COLOR_BGR2YUV: {
+                constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_yuv<
+                    uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
-
-            case eColorConversionCode::COLOR_RGB2YUV:
-                Kernels::Device::rgb_or_bgr_to_yuv<uchar3, eSwizzle::XYZW><<<gridSize, blockSize, 0, stream>>>(
-                    ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+            }
+
+            case eColorConversionCode::COLOR_RGB2YUV: {
+                constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_yuv<
+                    uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
-
-            case eColorConversionCode::COLOR_YUV2BGR:
-                Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::ZYXW><<<gridSize, blockSize, 0, stream>>>(
-                    ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+            }
+
+            case eColorConversionCode::COLOR_YUV2BGR: {
+                constexpr auto kernel = Kernels::Device::yuv_to_rgb_or_bgr<
+                    uchar3, eSwizzle::ZYXW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
-
-            case eColorConversionCode::COLOR_YUV2RGB:
-                Kernels::Device::yuv_to_rgb_or_bgr<uchar3, eSwizzle::XYZW><<<gridSize, blockSize, 0, stream>>>(
-                    ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
+            }
+
+            case eColorConversionCode::COLOR_YUV2RGB: {
+                constexpr auto kernel = Kernels::Device::yuv_to_rgb_or_bgr<
+                    uchar3, eSwizzle::XYZW, ImageWrapper<uchar3>, ImageWrapper<uchar3>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, samples, block);
+                kernel<<<grid, block, 0, stream>>>(ImageWrapper<uchar3>(input), ImageWrapper<uchar3>(output), 128.0f);
                 break;
+            }
 
             default:
                 throw Exception("Not implemented", eStatusType::NOT_IMPLEMENTED);
diff --git a/src/op_flip.cpp b/src/op_flip.cpp
index 5566a0b6..54886b6f 100644
--- a/src/op_flip.cpp
+++ b/src/op_flip.cpp
@@ -27,6 +27,7 @@ THE SOFTWARE.
 #include <unordered_map>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/exception.hpp"
 #include "core/status_type.h"
 #include "core/wrappers/image_wrapper.hpp"
@@ -42,10 +43,11 @@ void dispatch_flip_axis(hipStream_t stream, const Tensor& input, const Tensor& o
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
-            Kernels::Device::flip<FlipType><<<grid, block, 0, stream>>>(inputWrapper, outputWrapper);
+            constexpr auto kernel = Kernels::Device::flip<FlipType, ImageWrapper<T>, ImageWrapper<T>>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
+            dim3 grid =
+                detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper);
             break;
         }
 
diff --git a/src/op_gamma_contrast.cpp b/src/op_gamma_contrast.cpp
index d6c78690..e764d27c 100644
--- a/src/op_gamma_contrast.cpp
+++ b/src/op_gamma_contrast.cpp
@@ -24,15 +24,11 @@ THE SOFTWARE.
 
 #include <hip/hip_runtime.h>
 
-#include <algorithm>
 #include <cstring>
 #include <functional>
-#include <iostream>
-#include <vector>
 
-#include "common/array_wrapper.hpp"
-#include "common/math_vector.hpp"
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/tensor.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/gamma_contrast_device.hpp"
@@ -47,11 +43,12 @@ void dispatch_gamma_contrast_dtype(hipStream_t stream, const Tensor &input, cons
     ImageWrapper<T> outputWrapper(output);
 
     if (device == eDeviceType::GPU) {
-        dim3 block(64, 16);
-        dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                  outputWrapper.batches());
+        constexpr auto kernel = Kernels::Device::gamma_contrast<ImageWrapper<T>, ImageWrapper<T>>;
+        dim3 block = detail::GetBlockSize1D<kernel>();
+        dim3 grid =
+            detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
 
-        Kernels::Device::gamma_contrast<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, gamma);
+        kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, gamma);
     } else if (device == eDeviceType::CPU) {
         Kernels::Host::gamma_contrast(inputWrapper, outputWrapper, gamma);
     }
diff --git a/src/op_histogram.cpp b/src/op_histogram.cpp
index 24a2ef79..819c70fc 100644
--- a/src/op_histogram.cpp
+++ b/src/op_histogram.cpp
@@ -22,13 +22,11 @@ THE SOFTWARE.
 #include "op_histogram.hpp"
 
 #include <hip/hip_runtime.h>
-#include <stdio.h>
 
 #include <cstring>
 #include <functional>
 #include <unordered_map>
 
-#include "common/array_wrapper.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/wrappers/generic_tensor_wrapper.hpp"
 #include "core/wrappers/image_wrapper.hpp"
diff --git a/src/op_non_max_suppression.cpp b/src/op_non_max_suppression.cpp
index 93c4f90c..d34b819e 100644
--- a/src/op_non_max_suppression.cpp
+++ b/src/op_non_max_suppression.cpp
@@ -24,7 +24,6 @@ THE SOFTWARE.
 #include <hip/hip_runtime.h>
 
 #include "common/validation_helpers.hpp"
-#include "core/hip_assert.h"
 #include "kernels/device/non_max_suppression_device.hpp"
 #include "kernels/host/non_max_suppression_host.hpp"
 
diff --git a/src/op_normalize.cpp b/src/op_normalize.cpp
index 529638b8..666b317e 100644
--- a/src/op_normalize.cpp
+++ b/src/op_normalize.cpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include <unordered_map>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/detail/type_traits.hpp"
 #include "core/tensor.hpp"
 #include "core/wrappers/image_wrapper.hpp"
@@ -49,11 +50,13 @@ void dispatch_normalize_stddev(hipStream_t stream, const Tensor& input, const Te
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(32, 8);
-            dim3 grid((outputWrap.width() + block.x - 1) / block.x, (outputWrap.height() + block.y - 1) / block.y,
-                      outputWrap.batches());
-            Kernels::Device::normalize<ScaleStddev>
-                <<<grid, block, 0, stream>>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, epsilon);
+            constexpr auto kernel =
+                Kernels::Device::normalize<ScaleStddev, ImageWrapper<T>, ImageWrapper<T>, ImageWrapper<work_type>,
+                                            ImageWrapper<work_type>>;
+            dim3 block = detail::GetBlockSize1D<kernel>();
+            dim3 grid = detail::GetGridSize1D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block);
+            kernel<<<grid, block, 0, stream>>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift,
+                                                 epsilon);
             break;
         }
         case eDeviceType::CPU: {
@@ -112,7 +115,8 @@ void Normalize::operator()(hipStream_t stream, const Tensor& input, const Tensor
     // TODO: Need to support scalar base/scale tensors at some point. Will require some extra handling on the kernel
     // level. Once in place, this check can be removed.
     CHECK_TENSOR_COMPARISON(base.shape(base.layout().channels_index()) == input.shape(input.layout().channels_index()));
-    CHECK_TENSOR_COMPARISON(scale.shape(scale.layout().channels_index()) == input.shape(input.layout().channels_index()));
+    CHECK_TENSOR_COMPARISON(scale.shape(scale.layout().channels_index()) ==
+                            input.shape(input.layout().channels_index()));
 
     // Create kernel dispatching table based on input/output datatype and number of channels.
     // clang-format off
diff --git a/src/op_remap.cpp b/src/op_remap.cpp
index 0992cf44..cdd4209d 100644
--- a/src/op_remap.cpp
+++ b/src/op_remap.cpp
@@ -23,12 +23,10 @@ THE SOFTWARE.
 
 #include <functional>
 
-#include "common/array_wrapper.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/detail/casting.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/detail/internal_structs.hpp"
-#include "core/detail/math/math.hpp"
-#include "core/detail/type_traits.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "core/wrappers/interpolation_wrapper.hpp"
 #include "kernels/device/remap_device.hpp"
@@ -91,11 +89,12 @@ void dispatch_remap_mapInterp(hipStream_t stream, const Tensor &input, const Ten
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
-            Kernels::Device::remap<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, wrappedMapTensor,
-                                                               mapBatchSize, params);
+            constexpr auto kernel = Kernels::Device::remap<
+                InterpolationWrapper<T, B, I>, ImageWrapper<T>, InterpolationWrapper<float2, B, M>>;
+            dim3 block = detail::GetBlockSize2D<kernel>();
+            dim3 grid =
+                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, wrappedMapTensor, mapBatchSize, params);
             break;
         }
 
diff --git a/src/op_resize.cpp b/src/op_resize.cpp
index d7cd0b61..30d9dc14 100644
--- a/src/op_resize.cpp
+++ b/src/op_resize.cpp
@@ -25,7 +25,7 @@ THE SOFTWARE.
 #include <unordered_map>
 
 #include "common/validation_helpers.hpp"
-#include "core/detail/casting.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/exception.hpp"
 #include "core/status_type.h"
 #include "core/wrappers/interpolation_wrapper.hpp"
@@ -45,10 +45,12 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
-            Kernels::Device::resize<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, scaleX, scaleY);
+            constexpr auto kernel = Kernels::Device::resize<
+                InterpolationWrapper<T, eBorderType::BORDER_TYPE_REPLICATE, I>, ImageWrapper<T>>;
+            dim3 block = detail::GetBlockSize2D<kernel>();
+            dim3 grid =
+                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, scaleX, scaleY);
             break;
         }
 
@@ -62,13 +64,13 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso
 template <typename T>
 void dispatch_resize_dtype(hipStream_t stream, const Tensor& input, const Tensor& output,
                            eInterpolationType interpolation, eDeviceType device) {
-    static const std::unordered_map<
-        eInterpolationType,
-        std::function<void(hipStream_t stream, const Tensor& input, const Tensor& output, eDeviceType device)>>
-        funcs = {{eInterpolationType::INTERP_TYPE_NEAREST, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_NEAREST>},
-                 {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_LINEAR>},
-                 {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_CUBIC>}
-                };
+    static const std::unordered_map<eInterpolationType, std::function<void(hipStream_t stream, const Tensor& input,
+                                                                           const Tensor& output, eDeviceType device)>>
+        funcs = {
+            {eInterpolationType::INTERP_TYPE_NEAREST,
+             dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_NEAREST>},
+            {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_LINEAR>},
+            {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp<T, eInterpolationType::INTERP_TYPE_CUBIC>}};
 
     if (!funcs.contains(interpolation)) {
         throw Exception("Operation does not support the given interpolation mode.", eStatusType::NOT_IMPLEMENTED);
@@ -78,8 +80,8 @@ void dispatch_resize_dtype(hipStream_t stream, const Tensor& input, const Tensor
     func(stream, input, output, device);
 }
 
-void Resize::operator()(hipStream_t stream, const Tensor& input, const Tensor& output,
-                        eInterpolationType interpolation, eDeviceType device) const {
+void Resize::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, eInterpolationType interpolation,
+                        eDeviceType device) const {
     CHECK_TENSOR_DEVICE(input, device);
     CHECK_TENSOR_DEVICE(output, device);
 
diff --git a/src/op_rotate.cpp b/src/op_rotate.cpp
index 28806779..be7d73a4 100644
--- a/src/op_rotate.cpp
+++ b/src/op_rotate.cpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 #include "common/array_wrapper.hpp"
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/interpolation_wrapper.hpp"
 #include "kernels/device/rotate_device.hpp"
 #include "kernels/host/rotate_host.hpp"
@@ -59,10 +60,12 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso
 
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(32, 16);
-            dim3 grid((outputWrap.width() + block.x - 1) / block.x, (outputWrap.height() + block.y - 1) / block.y,
-                      outputWrap.batches());
-            Kernels::Device::rotate<<<grid, block, 0, stream>>>(inputWrap, outputWrap, matWrap);
+            constexpr auto kernel = Kernels::Device::rotate<
+                InterpolationWrapper<T, eBorderType::BORDER_TYPE_CONSTANT, InterpType>, ImageWrapper<T>,
+                ArrayWrapper<double, 6>>;
+            dim3 block = detail::GetBlockSize2D<kernel>();
+            dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block);
+            kernel<<<grid, block, 0, stream>>>(inputWrap, outputWrap, matWrap);
             break;
         }
 
diff --git a/src/op_thresholding.cpp b/src/op_thresholding.cpp
index 420ac84a..2325c5eb 100644
--- a/src/op_thresholding.cpp
+++ b/src/op_thresholding.cpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include <functional>
 
 #include "common/validation_helpers.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/wrappers/generic_tensor_wrapper.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "kernels/device/thresholding_device.hpp"
@@ -50,34 +51,45 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten
     const auto width = input.shape()[input.shape().layout().width_index()];
 
     if (device == eDeviceType::GPU) {
-        dim3 block(64, 16);
-        dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y, outputWrapper.batches());
-
         switch (m_threshType) {
-            case THRESH_BINARY:
-                Kernels::Device::binary_generic<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper,
-                                                                            GenericTensorWrapper<double>(thresh),
-                                                                            GenericTensorWrapper<double>(maxVal));
+            case THRESH_BINARY: {
+                constexpr auto kernel = Kernels::Device::binary_generic<ImageWrapper<T>, ImageWrapper<T>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
+                kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh),
+                                                   GenericTensorWrapper<double>(maxVal));
                 break;
-            case THRESH_BINARY_INV:
-                Kernels::Device::binary_inv_generic<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper,
-                                                                                GenericTensorWrapper<double>(thresh),
-                                                                                GenericTensorWrapper<double>(maxVal));
+            }
+            case THRESH_BINARY_INV: {
+                constexpr auto kernel = Kernels::Device::binary_inv_generic<ImageWrapper<T>, ImageWrapper<T>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
+                kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh),
+                                                   GenericTensorWrapper<double>(maxVal));
                 break;
-            case THRESH_TRUNC:
-                Kernels::Device::trunc_generic<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper,
-                                                                           GenericTensorWrapper<double>(thresh));
+            }
+            case THRESH_TRUNC: {
+                constexpr auto kernel = Kernels::Device::trunc_generic<ImageWrapper<T>, ImageWrapper<T>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
+                kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh));
                 break;
-            case THRESH_TOZERO:
-                Kernels::Device::tozero_generic<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper,
-                                                                            GenericTensorWrapper<double>(thresh));
+            }
+            case THRESH_TOZERO: {
+                constexpr auto kernel = Kernels::Device::tozero_generic<ImageWrapper<T>, ImageWrapper<T>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
+                kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh));
                 break;
-            case THRESH_TOZERO_INV:
-                Kernels::Device::tozeroinv_generic<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper,
-                                                                               GenericTensorWrapper<double>(thresh));
+            }
+            case THRESH_TOZERO_INV: {
+                constexpr auto kernel = Kernels::Device::tozeroinv_generic<ImageWrapper<T>, ImageWrapper<T>>;
+                dim3 block = detail::GetBlockSize1D<kernel>();
+                dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block);
+                kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, GenericTensorWrapper<double>(thresh));
                 break;
+            }
         }
-
     } else if (device == eDeviceType::CPU) {
         switch (m_threshType) {
             case THRESH_BINARY:
diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp
index ca77fc8c..4db3f3f4 100644
--- a/src/op_warp_perspective.cpp
+++ b/src/op_warp_perspective.cpp
@@ -26,8 +26,8 @@ THE SOFTWARE.
 #include "common/array_wrapper.hpp"
 #include "common/validation_helpers.hpp"
 #include "core/detail/casting.hpp"
+#include "core/detail/hip_utils.hpp"
 #include "core/detail/math/math.hpp"
-#include "core/detail/type_traits.hpp"
 #include "kernels/device/warp_perspective_device.hpp"
 #include "kernels/host/warp_perspective_host.hpp"
 
@@ -42,10 +42,12 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
-            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
-                      outputWrapper.batches());
-            Kernels::Device::warp_perspective<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);
+            constexpr auto kernel = Kernels::Device::warp_perspective<
+                InterpolationWrapper<T, B, I>, ImageWrapper<T>, ArrayWrapper<float, 9>>;
+            dim3 block = detail::GetBlockSize2D<kernel>();
+            dim3 grid =
+                detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
+            kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);
             break;
         }