Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
77b2090
Create helper function for determining maximum block size
zacharyvincze Feb 9, 2026
6d616af
Define StreamCallback as static inline
zacharyvincze Feb 9, 2026
348816d
Use dynamic block size helper for Composite operator
zacharyvincze Feb 9, 2026
3b0488e
Add helper for grid size calculations
zacharyvincze Feb 9, 2026
2ceddce
Add dynamic block size calculations to operators
zacharyvincze Feb 9, 2026
8c84900
Add dynamic block sizes to rest of operators
zacharyvincze Feb 9, 2026
d6c3418
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Feb 11, 2026
5bbacd9
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Feb 12, 2026
284eca8
Address PR comments
zacharyvincze Feb 17, 2026
77d79e4
Update block calculations to floor result
zacharyvincze Feb 19, 2026
4417820
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Feb 19, 2026
1c37b91
Hardcode target block size instead of querying runtime
zacharyvincze Feb 20, 2026
4a02208
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Feb 23, 2026
9627e21
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Feb 25, 2026
88a3eb2
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Mar 2, 2026
222d50c
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Mar 6, 2026
9666d83
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Mar 6, 2026
a305bf6
Merge branch 'zv/optimization/dynamic-block-sizes' of github.com:zach…
zacharyvincze Mar 6, 2026
25f4483
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Mar 16, 2026
0188cf2
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Apr 23, 2026
2c2fb41
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Apr 23, 2026
1efde9d
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Apr 24, 2026
a899bb0
Use 1D blocks for pointwise kernels
zacharyvincze Apr 27, 2026
7ce6521
Merge branch 'zv/optimization/dynamic-block-sizes' of github.com:zach…
zacharyvincze Apr 27, 2026
063f97f
Change default 2D block size to 256
zacharyvincze Apr 27, 2026
9ba59d4
Switch default target block size to 128
zacharyvincze Apr 27, 2026
981346a
Use runtime API to determine maximum occupancy
zacharyvincze Apr 27, 2026
f5baca1
Increase default caps
zacharyvincze Apr 27, 2026
00a543d
Decrease 1D block size cap to 256
zacharyvincze Apr 27, 2026
e6f5588
Increase block size cap to 1024 for 1D blocks
zacharyvincze Apr 27, 2026
fc8c008
Return cap sizes back to normal defaults
zacharyvincze Apr 27, 2026
54849f9
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze May 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 130 additions & 2 deletions include/core/detail/hip_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@
#include <functional>
#include <memory>

#include "core/hip_assert.h"

namespace roccv::detail {
static void StreamCallback(void* userData) {
static inline void StreamCallback(void* userData) {
std::function<void()>* func = static_cast<std::function<void()>*>(userData);
(*func)();
delete func;
Expand All @@ -46,4 +48,130 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) {
HIP_VALIDATE_NO_ERRORS(hipLaunchHostFunc(stream, StreamCallback, data.get()));
data.release(); // Release ownership, StreamCallback is responsible for it now
}
} // namespace roccv::detail

/**
 * @brief Return the wavefront (warp) size of the currently active HIP device.
 *
 * The result is cached per thread and refreshed whenever the active device
 * changes, so multi-device callers always observe the correct width while
 * the common single-device path pays only for a hipGetDevice call.
 */
inline int CachedWarpSize() {
    static thread_local int lastDeviceId = -1;
    static thread_local int lastWarpSize = 0;

    int activeDevice = -1;
    HIP_VALIDATE_NO_ERRORS(hipGetDevice(&activeDevice));
    if (activeDevice == lastDeviceId) {
        return lastWarpSize;
    }

    HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&lastWarpSize, hipDeviceAttributeWarpSize, activeDevice));
    lastDeviceId = activeDevice;
    return lastWarpSize;
}

/**
 * @brief Query and cache hipOccupancyMaxPotentialBlockSize per (kernel, device).
 *
 * The driver selects a thread count that maximizes resident wavefronts per CU
 * for the given kernel on the current device, taking its register and
 * static-shared-memory usage into account. The answer is clamped to Cap so
 * the API's preference for maximum occupancy cannot override workload-class
 * judgment (memory-bound ops gain nothing past ~50% occupancy and can lose
 * throughput to cache pressure with overly large blocks).
 *
 * Each (Kernel, Cap) instantiation owns its own thread-local cache slot, so
 * the runtime query executes once per (kernel, device, cap) per thread.
 *
 * @tparam Kernel The __global__ function pointer (auto NTTP — each unique
 *                kernel address gets its own cached result).
 * @tparam Cap Upper bound on the returned block size.
 */
template <auto Kernel, int Cap>
inline int CachedOccupancyBlockSize() {
    static thread_local int lastDeviceId = -1;
    static thread_local int lastBlockSize = 0;

    int activeDevice = -1;
    HIP_VALIDATE_NO_ERRORS(hipGetDevice(&activeDevice));
    if (activeDevice == lastDeviceId) {
        return lastBlockSize;
    }

    int unusedMinGridSize = 0;
    HIP_VALIDATE_NO_ERRORS(hipOccupancyMaxPotentialBlockSize(&unusedMinGridSize, &lastBlockSize, Kernel, 0, Cap));
    lastDeviceId = activeDevice;
    return lastBlockSize;
}

/**
 * @brief Choose a 1D block size for a pointwise kernel via a runtime
 *        occupancy query, bounded by Cap and cached per (kernel, device).
 *
 * Intended for pointwise kernels (no neighborhood reads). The driver tunes
 * the thread count to this specific kernel's register pressure on the
 * current device — important on architectures with very different
 * SIMD-per-CU counts and register-file sizes (e.g. CDNA wants more
 * wavefronts in flight per CU than RDNA to hide HBM latency).
 *
 * Pair with GetGridSize1D — see its docs for the row-major launch shape.
 *
 * @tparam Kernel The __global__ function pointer.
 * @tparam Cap Upper bound on threads per block.
 * @return dim3(blockSize, 1, 1).
 */
template <auto Kernel, int Cap = 256>
inline dim3 GetBlockSize1D() {
    const int threads = CachedOccupancyBlockSize<Kernel, Cap>();
    return dim3(threads, 1, 1);
}

/**
 * @brief Pick a 2D block size for a stencil/transform kernel via runtime
 * occupancy query, capped at Cap, and cached per (kernel, device).
 *
 * Use for kernels with 2D locality (stencils, interpolation neighborhoods,
 * affine warps). Reshapes the queried thread count as
 * (warpSize, blockSize / warpSize, 1) so threadIdx.x is wavefront-aligned
 * for coalescing while threadIdx.y stacks rows for tile-style cache reuse.
 *
 * @tparam Kernel The __global__ function pointer.
 * @tparam Cap Upper bound on threads per block.
 * @return dim3(warpSize, max(blockSize / warpSize, 1), 1).
 */
template <auto Kernel, int Cap = 512>
inline dim3 GetBlockSize2D() {
    int blockSize = CachedOccupancyBlockSize<Kernel, Cap>();
    int warpSize = CachedWarpSize();
    // Guard: if Cap (or the occupancy query) yields fewer threads than one
    // wavefront, blockSize / warpSize floors to 0 and would produce an
    // invalid launch configuration (blockDim.y == 0). Clamp to one row.
    int rows = blockSize / warpSize;
    if (rows < 1) rows = 1;
    return dim3(warpSize, rows, 1);
}

/**
 * @brief Compute the launch grid for a 2D kernel.
 *
 * @param[in] width The width of the image.
 * @param[in] height The height of the image.
 * @param[in] batchSize The batch size of the image.
 * @param[in] blockSize The block size of the kernel.
 * @return The grid size (ceil-divided in x and y, batch along z).
 */
static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize, dim3 blockSize) {
    const size_t gridX = (width + blockSize.x - 1) / blockSize.x;
    const size_t gridY = (height + blockSize.y - 1) / blockSize.y;
    return dim3(gridX, gridY, batchSize);
}

/**
 * @brief Compute the launch grid for a 1D-row-major launch.
 *
 * Places one image row along gridDim.y and the batch along gridDim.z. The
 * block from GetBlockSize1D has blockDim.y == 1, so the kernel's standard
 * `y = blockDim.y * blockIdx.y + threadIdx.y` collapses to `y = blockIdx.y`
 * with no kernel changes required.
 *
 * Caller is responsible for ensuring height does not exceed the device's
 * gridDim.y limit (typically 65535) and batchSize does not exceed gridDim.z.
 *
 * @param[in] width The width of the image.
 * @param[in] height The height of the image (becomes gridDim.y).
 * @param[in] batchSize The number of images (becomes gridDim.z).
 * @param[in] blockSize Block size from GetBlockSize1D.
 * @return The grid size: dim3(ceil(width / blockSize.x), height, batchSize).
 */
static inline dim3 GetGridSize1D(size_t width, size_t height, size_t batchSize, dim3 blockSize) {
    const size_t blocksPerRow = (width + blockSize.x - 1) / blockSize.x;
    return dim3(blocksPerRow, height, batchSize);
}

} // namespace roccv::detail
1 change: 1 addition & 0 deletions include/kernels/device/bnd_box_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ THE SOFTWARE.

#include <core/detail/type_traits.hpp>

#include "common/math_vector.hpp"
#include "kernels/kernel_helpers.hpp"
#include "operator_types.h"

Expand Down
1 change: 1 addition & 0 deletions include/kernels/host/bnd_box_host.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ THE SOFTWARE.

#include <core/detail/type_traits.hpp>

#include "common/math_vector.hpp"
#include "kernels/kernel_helpers.hpp"
#include "operator_types.h"

Expand Down
10 changes: 5 additions & 5 deletions src/op_bilateral_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ THE SOFTWARE.

#include "common/validation_helpers.hpp"
#include "core/detail/casting.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/wrappers/border_wrapper.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "kernels/device/bilateral_filter_device.hpp"
Expand Down Expand Up @@ -61,21 +62,20 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp
sigmaSpace = 1.0f;
}

const int radius =
(diameter <= 0) ? static_cast<int>(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1);
const int radius = (diameter <= 0) ? static_cast<int>(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1);

float spaceCoeff = -1 / (2 * sigmaSpace * sigmaSpace);
float colorCoeff = -1 / (2 * sigmaColor * sigmaColor);

if (device == eDeviceType::GPU) {
dim3 block(8, 8);
constexpr auto kernel = Kernels::Device::bilateral_filter<T, BorderWrapper<T, B>, ImageWrapper<T>>;
dim3 block = detail::GetBlockSize2D<kernel>();
uint32_t xGridSize = (outputWrapper.width() + (block.x * 2) - 1) / (block.x * 2);
uint32_t yGridSize = (outputWrapper.height() + (block.y * 2) - 1) / (block.y * 2);
uint32_t zGridSize = outputWrapper.batches();
dim3 grid(xGridSize, yGridSize, zGridSize);

Kernels::Device::bilateral_filter<T>
<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff);
kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff);
} else if (device == eDeviceType::CPU) {
int divisor = std::gcd(4, outputWrapper.height()); // greatest common divisor
int dividend = std::gcd((numThreads / divisor), outputWrapper.width());
Expand Down
11 changes: 5 additions & 6 deletions src/op_bnd_box.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,8 @@ THE SOFTWARE.

#include <algorithm>
#include <cstring>
#include <iostream>
#include <vector>

#include "common/math_vector.hpp"
#include "common/validation_helpers.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/tensor.hpp"
Expand All @@ -52,8 +50,9 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
auto batchSize = inputWrapper.batches();
switch (device) {
case eDeviceType::GPU: {
const dim3 block(32, 32);
const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y, batchSize);
constexpr auto kernel = Kernels::Device::bndbox_kernel<has_alpha, T, ImageWrapper<T>, ImageWrapper<T>>;
const dim3 block = detail::GetBlockSize1D<kernel>();
const dim3 grid = detail::GetGridSize1D(width, height, batchSize, block);

Rect_t *rects_ptr = nullptr;
const auto n_rects = rects->size();
Expand All @@ -63,8 +62,8 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
HIP_VALIDATE_NO_ERRORS(
hipMemcpyAsync(rects_ptr, rects->data(), sizeof(Rect_t) * n_rects, hipMemcpyHostToDevice, stream));
}
Kernels::Device::bndbox_kernel<has_alpha, T>
<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height, width);
kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height,
width);
if (n_rects > 0) {
HIP_VALIDATE_NO_ERRORS(hipFreeAsync(rects_ptr, stream));
}
Expand Down
11 changes: 7 additions & 4 deletions src/op_composite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <functional>

#include "common/validation_helpers.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "kernels/device/composite_device.hpp"
#include "kernels/host/composite_host.hpp"
Expand All @@ -40,10 +41,12 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c

switch (device) {
case eDeviceType::GPU: {
dim3 block(64, 16);
dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
outputWrapper.batches());
Kernels::Device::composite<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
constexpr auto kernel = Kernels::Device::composite<ImageWrapper<SrcType>, ImageWrapper<MaskType>,
ImageWrapper<DstType>>;
dim3 block = detail::GetBlockSize1D<kernel>();
dim3 grid =
detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
kernel<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
break;
}

Expand Down
38 changes: 19 additions & 19 deletions src/op_convert_to.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,23 @@ THE SOFTWARE.
*/
#include "op_convert_to.hpp"

#include <hip/hip_runtime.h>

#include <functional>

#include <hip/hip_runtime.h>
#include "core/wrappers/image_wrapper.hpp"
#include "common/validation_helpers.hpp"
#include "core/detail/casting.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/detail/type_traits.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "kernels/device/convert_to_device.hpp"
#include "kernels/host/convert_to_host.hpp"

namespace roccv {

template <typename SRC_DT, typename DST_DT, int NC>
void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output,
double alpha, double beta, eDeviceType device) {

void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha,
double beta, eDeviceType device) {
using SRC_DT_NC = detail::MakeType<SRC_DT, NC>;
using DST_DT_NC = detail::MakeType<DST_DT, NC>;

Expand All @@ -47,17 +48,19 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
using DST_BT = detail::BaseType<DST_DT>;

using DT_AB = decltype(float() * SRC_BT() * DST_BT());

DT_AB alpha_ab = detail::SaturateCast<DT_AB>(alpha);
DT_AB beta_ab = detail::SaturateCast<DT_AB>(beta);

// Launch CPU/GPU kernel depending on requested device type.
switch (device) {
case eDeviceType::GPU: {
dim3 block(64, 16);
dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
outputWrapper.batches());
Kernels::Device::convert_to<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
constexpr auto kernel =
Kernels::Device::convert_to<ImageWrapper<SRC_DT_NC>, ImageWrapper<DST_DT_NC>, DT_AB>;
dim3 block = detail::GetBlockSize1D<kernel>();
dim3 grid =
detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
break;
}
case eDeviceType::CPU: {
Expand All @@ -68,16 +71,14 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
}

template <typename SRC_DT, typename DST_DT>
void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
double alpha, double beta, eDeviceType device) {

void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha,
double beta, eDeviceType device) {
int64_t channels = output.shape(output.layout().channels_index());
// Select kernel dispatcher based on number of channels.
// clang-format off
static const std::array<std::function<void(hipStream_t, const Tensor &, const Tensor &, double, double, eDeviceType)>, 4>
funcs = {dispatch_convert_to_channels<SRC_DT, DST_DT, 1>, dispatch_convert_to_channels<SRC_DT, DST_DT, 2>, dispatch_convert_to_channels<SRC_DT, DST_DT, 3>, dispatch_convert_to_channels<SRC_DT, DST_DT, 4>};



// clang-format on

auto func = funcs.at(channels - 1);
Expand All @@ -86,11 +87,10 @@ void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, c
}

template <typename SRC_DT>
void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
double alpha, double beta, eDeviceType device) {

void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha,
double beta, eDeviceType device) {
eDataType output_dtype = output.dtype().etype();

// Select kernel dispatcher based on a base input datatype.
// clang-format off
static const std::unordered_map<eDataType, std::function<void(hipStream_t, const Tensor &, const Tensor &, double, double, eDeviceType)>>
Expand Down
12 changes: 7 additions & 5 deletions src/op_copy_make_border.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@
#include <functional>

#include "common/validation_helpers.hpp"
#include "core/detail/casting.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/wrappers/border_wrapper.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "core/wrappers/interpolation_wrapper.hpp"
#include "kernels/device/copy_make_border_device.hpp"
#include "kernels/host/copy_make_border_host.hpp"

Expand All @@ -43,10 +44,11 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp

switch (device) {
case eDeviceType::GPU: {
dim3 block_dim(64, 16);
dim3 grid_dim((out_desc.width() + block_dim.x - 1) / block_dim.x,
(out_desc.height() + block_dim.y - 1) / block_dim.y, out_desc.batches());
Kernels::Device::copy_make_border<<<grid_dim, block_dim, 0, stream>>>(in_desc, out_desc, top, left);
constexpr auto kernel =
Kernels::Device::copy_make_border<BorderWrapper<T, BorderMode>, ImageWrapper<T>>;
dim3 block = detail::GetBlockSize1D<kernel>();
dim3 grid = detail::GetGridSize1D(out_desc.width(), out_desc.height(), out_desc.batches(), block);
kernel<<<grid, block, 0, stream>>>(in_desc, out_desc, top, left);
break;
}
case eDeviceType::CPU: {
Expand Down
10 changes: 6 additions & 4 deletions src/op_custom_crop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ THE SOFTWARE.
#include <functional>

#include "common/validation_helpers.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "kernels/device/custom_crop_device.hpp"
#include "kernels/host/custom_crop_host.hpp"
Expand All @@ -40,10 +41,11 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T

switch (device) {
case eDeviceType::GPU: {
dim3 block(64, 16);
dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
outputWrapper.batches());
Kernels::Device::custom_crop<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
constexpr auto kernel = Kernels::Device::custom_crop<ImageWrapper<T>, ImageWrapper<T>>;
dim3 block = detail::GetBlockSize1D<kernel>();
dim3 grid =
detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
break;
}

Expand Down
Loading