From 718c1ad3d3985379fe73ab4ab6a37cc2ac209356 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 1 Apr 2026 15:01:59 -0400
Subject: [PATCH 1/9] Improve hot paths for BorderWrapper/InterpolationWrapper

---
 include/core/wrappers/border_wrapper.hpp      | 29 +++++++--
 .../core/wrappers/interpolation_wrapper.hpp   | 63 ++++++++++++++-----
 2 files changed, 71 insertions(+), 21 deletions(-)
diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp
index f55493a5..84c80835 100644
--- a/include/core/wrappers/border_wrapper.hpp
+++ b/include/core/wrappers/border_wrapper.hpp
@@ -27,6 +27,16 @@
 #include "operator_types.h"
 
 namespace roccv {
+namespace detail {
+
+/** Euclidean modulo: result in [0, modulus) for modulus > 0. One hardware remainder vs (a%m+m)%m. */
+__device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) {
+    int64_t r = a % modulus;
+    if (r < 0) r += modulus;
+    return r;
+}
+
+}  // namespace detail
 
 /**
  * @brief Wrapper class for ImageWrapper. This extends the descriptors by defining behaviors for when tensor
@@ -56,6 +66,13 @@ class BorderWrapper {
     BorderWrapper(ImageWrapper<T> image_wrapper, T border_value)
         : m_desc(image_wrapper), m_border_value(border_value) {}
 
+    /**
+     * @brief Sample the underlying image with no border logic. Caller must ensure coordinates are in-range.
+     */
+    __device__ __host__ inline const T at_inbounds(int64_t n, int64_t h, int64_t w, int64_t c) const {
+        return m_desc.at(n, h, w, c);
+    }
+
     /**
      * @brief Returns a reference to the underlying data given image coordinates. If the coordinates fall out of bounds,
      * a fallback reference based on the provided border type will be given instead.
@@ -92,11 +109,11 @@ class BorderWrapper {
         // is the intended behavior for this border mode.)
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) {
             int64_t scale = imgWidth * 2;
-            int64_t val = (w % scale + scale) % scale;
+            int64_t val = detail::euclid_mod_i64(w, scale);
             x = (val < imgWidth) ? val : scale - 1 - val;
 
             scale = imgHeight * 2;
-            val = (h % scale + scale) % scale;
+            val = detail::euclid_mod_i64(h, scale);
             y = (val < imgHeight) ? val : scale - 1 - val;
         }
 
@@ -105,7 +122,7 @@ class BorderWrapper {
                 x = 0;
             } else {
                 int64_t scale = 2 * imgWidth - 2;
-                x = (w % scale + scale) % scale;
+                x = detail::euclid_mod_i64(w, scale);
                 x = imgWidth - 1 - std::abs(imgWidth - 1 - x);
             }
 
@@ -113,7 +130,7 @@ class BorderWrapper {
                 y = 0;
             } else {
                 int64_t scale = 2 * imgHeight - 2;
-                y = (h % scale + scale) % scale;
+                y = detail::euclid_mod_i64(h, scale);
                 y = imgHeight - 1 - std::abs(imgHeight - 1 - y);
             }
         }
@@ -127,11 +144,11 @@ class BorderWrapper {
         // Wrap border type implementation
         if constexpr (BorderType == eBorderType::BORDER_TYPE_WRAP) {
             if (w < 0 || w >= imgWidth) {
-                x = (w % imgWidth + imgWidth) % imgWidth;
+                x = detail::euclid_mod_i64(w, imgWidth);
             }
 
             if (h < 0 || h >= imgHeight) {
-                y = (h % imgHeight + imgHeight) % imgHeight;
+                y = detail::euclid_mod_i64(h, imgHeight);
             }
         }
 
diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp
index 7adb8cb6..e56b1e59 100644
--- a/include/core/wrappers/interpolation_wrapper.hpp
+++ b/include/core/wrappers/interpolation_wrapper.hpp
@@ -94,8 +94,12 @@ class InterpolationWrapper {
      */
     inline __device__ __host__ const T at(int64_t n, float h, float w, int64_t c) const {
         if constexpr (I == eInterpolationType::INTERP_TYPE_NEAREST) {
-            // Nearest neighbor interpolation implementation
-            return m_desc.at(n, lroundf(h), lroundf(w), c);
+            const int64_t rh = static_cast<int64_t>(lroundf(h));
+            const int64_t rw = static_cast<int64_t>(lroundf(w));
+            if (rw >= 0 && rw < m_desc.width() && rh >= 0 && rh < m_desc.height()) {
+                return m_desc.at_inbounds(n, rh, rw, c);
+            }
+            return m_desc.at(n, rh, rw, c);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_LINEAR) {
             // Bilinear interpolation implementation
             // v1 -- v2
@@ -109,34 +113,63 @@ class InterpolationWrapper {
             int64_t y0 = static_cast<int64_t>(floorf(h));
             int64_t y1 = y0 + 1;
 
+            if (x0 >= 0 && y0 >= 0 && x1 < m_desc.width() && y1 < m_desc.height()) {
+                auto v1 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y0, x0, c));
+                auto v2 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y0, x1, c));
+                auto v3 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y1, x0, c));
+                auto v4 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y1, x1, c));
+                auto q1 = v1 * static_cast<float>(x1 - w) + v2 * static_cast<float>(w - x0);
+                auto q2 = v3 * static_cast<float>(x1 - w) + v4 * static_cast<float>(w - x0);
+                auto q = q1 * static_cast<float>(y1 - h) + q2 * static_cast<float>(h - y0);
+                return detail::RangeCast<T>(q);
+            }
+
             auto v1 = detail::RangeCast<WorkType>(m_desc.at(n, y0, x0, c));
             auto v2 = detail::RangeCast<WorkType>(m_desc.at(n, y0, x1, c));
             auto v3 = detail::RangeCast<WorkType>(m_desc.at(n, y1, x0, c));
             auto v4 = detail::RangeCast<WorkType>(m_desc.at(n, y1, x1, c));
 
-            auto q1 = v1 * (x1 - w) + v2 * (w - x0);
-            auto q2 = v3 * (x1 - w) + v4 * (w - x0);
-            auto q = q1 * (y1 - h) + q2 * (h - y0);
+            auto q1 = v1 * static_cast<float>(x1 - w) + v2 * static_cast<float>(w - x0);
+            auto q2 = v3 * static_cast<float>(x1 - w) + v4 * static_cast<float>(w - x0);
+            auto q = q1 * static_cast<float>(y1 - h) + q2 * static_cast<float>(h - y0);
 
             return detail::RangeCast<T>(q);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_CUBIC) {
             using namespace roccv::detail;
             using WorkType = detail::MakeType<float, detail::NumElements<T>>;
 
-            // Integer coordinates for pixel (x, y)
-            int64_t int_x = static_cast<int64_t>(floorf(w));
-            int64_t int_y = static_cast<int64_t>(floorf(h));
+            const int64_t int_x = static_cast<int64_t>(floorf(w));
+            const int64_t int_y = static_cast<int64_t>(floorf(h));
 
-            // Calculate weights
             float weight_x[4], weight_y[4];
-            CalBicubicWeights(w - int_x, weight_x);
-            CalBicubicWeights(h - int_y, weight_y);
+            CalBicubicWeights(w - static_cast<float>(int_x), weight_x);
+            CalBicubicWeights(h - static_cast<float>(int_y), weight_y);
+
+            float wxy[16];
+            int k = 0;
+            for (int j = 0; j < 4; j++) {
+                for (int i = 0; i < 4; i++) {
+                    wxy[k++] = weight_y[j] * weight_x[i];
+                }
+            }
 
-            // Weighted sum
             WorkType sum = SetAll<WorkType>(0.0f);
-            for (int index_y = -1; index_y <= 2; index_y++) {
-                for (int index_x = -1; index_x <= 2; index_x++) {
-                    sum += detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, 0)) * (weight_x[index_x + 1] * weight_y[index_y + 1]);
+            const bool cubic_fast = int_x >= 1 && int_y >= 1 && (int_x + 2) < m_desc.width() && (int_y + 2) < m_desc.height();
+            k = 0;
+            if (cubic_fast) {
+                for (int index_y = -1; index_y <= 2; index_y++) {
+                    for (int index_x = -1; index_x <= 2; index_x++) {
+                        sum = sum + detail::RangeCast<WorkType>(
+                                          m_desc.at_inbounds(n, int_y + index_y, int_x + index_x, c)) *
+                                  wxy[k++];
+                    }
+                }
+            } else {
+                for (int index_y = -1; index_y <= 2; index_y++) {
+                    for (int index_x = -1; index_x <= 2; index_x++) {
+                        sum = sum + detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, c)) *
+                                  wxy[k++];
+                    }
                 }
             }
 

From 00f6b891b6dd63ec0275fee4e755a7388931a8ad Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 1 Apr 2026 15:42:20 -0400
Subject: [PATCH 2/9] Add dedicated warp_affine path

---
 include/core/wrappers/border_wrapper.hpp      |  27 +++-
 include/kernels/device/warp_affine_device.hpp |  45 ++++++
 include/kernels/host/warp_affine_host.hpp     |  42 ++++++
 include/op_warp_affine.hpp                    |   9 +-
 src/op_warp_affine.cpp                        | 139 ++++++++++++++++--
 5 files changed, 241 insertions(+), 21 deletions(-)
 create mode 100644 include/kernels/device/warp_affine_device.hpp
 create mode 100644 include/kernels/host/warp_affine_host.hpp

diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp
index 84c80835..67342ec2 100644
--- a/include/core/wrappers/border_wrapper.hpp
+++ b/include/core/wrappers/border_wrapper.hpp
@@ -21,6 +21,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <hip/hip_runtime.h>
 
 #include "core/wrappers/image_wrapper.hpp"
@@ -36,6 +37,24 @@ __device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) {
     return r;
 }
 
+/**
+ * On GPU, use 32-bit remainder when operands fit; avoids 64-bit integer division in REFLECT/WRAP hot paths.
+ * Host always uses euclid_mod_i64. Values must stay in range for correctness.
+ */
+__device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulus) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    constexpr int64_t kLim = int64_t{1} << 30;
+    if (modulus > 0 && modulus < kLim && a > -kLim && a < kLim) {
+        int32_t ai = static_cast<int32_t>(a);
+        int32_t m = static_cast<int32_t>(modulus);
+        int32_t r = ai % m;
+        if (r < 0) r += m;
+        return static_cast<int64_t>(r);
+    }
+#endif
+    return euclid_mod_i64(a, modulus);
+}
+
 }  // namespace detail
 
 /**
@@ -109,11 +128,11 @@ class BorderWrapper {
         // is the intended behavior for this border mode.)
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) {
             int64_t scale = imgWidth * 2;
-            int64_t val = detail::euclid_mod_i64(w, scale);
+            int64_t val = detail::euclid_mod_i64_fast(w, scale);
             x = (val < imgWidth) ? val : scale - 1 - val;
 
             scale = imgHeight * 2;
-            val = detail::euclid_mod_i64(h, scale);
+            val = detail::euclid_mod_i64_fast(h, scale);
             y = (val < imgHeight) ? val : scale - 1 - val;
         }
 
@@ -144,11 +163,11 @@ class BorderWrapper {
         // Wrap border type implementation
         if constexpr (BorderType == eBorderType::BORDER_TYPE_WRAP) {
             if (w < 0 || w >= imgWidth) {
-                x = detail::euclid_mod_i64(w, imgWidth);
+                x = detail::euclid_mod_i64_fast(w, imgWidth);
             }
 
             if (h < 0 || h >= imgHeight) {
-                y = detail::euclid_mod_i64(h, imgHeight);
+                y = detail::euclid_mod_i64_fast(h, imgHeight);
             }
         }
 
diff --git a/include/kernels/device/warp_affine_device.hpp b/include/kernels/device/warp_affine_device.hpp
new file mode 100644
index 00000000..bf417d52
--- /dev/null
+++ b/include/kernels/device/warp_affine_device.hpp
@@ -0,0 +1,45 @@
+/**
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include "core/wrappers/interpolation_wrapper.hpp"
+
+namespace Kernels {
+namespace Device {
+
+template <typename SrcWrapper, typename DstWrapper, typename Mat>
+__global__ void warp_affine(SrcWrapper input, DstWrapper output, Mat mat) {
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const int b = blockIdx.z;
+
+    if (x >= output.width() || y >= output.height()) return;
+
+    const float ox = mat[0] * static_cast<float>(x) + mat[1] * static_cast<float>(y) + mat[2];
+    const float oy = mat[3] * static_cast<float>(x) + mat[4] * static_cast<float>(y) + mat[5];
+    output.at(b, y, x, 0) = input.at(b, oy, ox, 0);
+}
+}  // namespace Device
+}  // namespace Kernels
diff --git a/include/kernels/host/warp_affine_host.hpp b/include/kernels/host/warp_affine_host.hpp
new file mode 100644
index 00000000..085c2179
--- /dev/null
+++ b/include/kernels/host/warp_affine_host.hpp
@@ -0,0 +1,42 @@
+/**
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+namespace Kernels {
+namespace Host {
+template <typename SrcWrapper, typename DstWrapper, typename Mat>
+void warp_affine(SrcWrapper input, DstWrapper output, Mat mat) {
+    for (int b = 0; b < output.batches(); b++) {
+        for (int y = 0; y < output.height(); y++) {
+            for (int x = 0; x < output.width(); x++) {
+                const float ox = mat[0] * static_cast<float>(x) + mat[1] * static_cast<float>(y) + mat[2];
+                const float oy = mat[3] * static_cast<float>(x) + mat[4] * static_cast<float>(y) + mat[5];
+                output.at(b, y, x, 0) = input.at(b, oy, ox, 0);
+            }
+        }
+    }
+}
+}  // namespace Host
+}  // namespace Kernels
diff --git a/include/op_warp_affine.hpp b/include/op_warp_affine.hpp
index 26960501..4ca7c751 100644
--- a/include/op_warp_affine.hpp
+++ b/include/op_warp_affine.hpp
@@ -21,9 +21,10 @@ THE SOFTWARE.
 */
 #pragma once
 
+#include <hip/hip_runtime.h>
+
 #include "core/tensor.hpp"
 #include "i_operator.hpp"
-#include "op_warp_perspective.hpp"
 #include "operator_types.h"
 
 namespace roccv {
@@ -40,7 +41,7 @@ class WarpAffine final : public IOperator {
      * @brief Construct a new WarpAffine object
      *
      */
-    WarpAffine() : m_op() {}
+    WarpAffine() = default;
 
     /**
      * @brief Destroy the WarpAffine object
@@ -89,9 +90,5 @@ class WarpAffine final : public IOperator {
     void operator()(hipStream_t stream, const Tensor& input, const Tensor& output, const AffineTransform xform,
                     bool isInverted, eInterpolationType interp, eBorderType borderMode, float4 borderValue,
                     eDeviceType device = eDeviceType::GPU) const;
-
-   private:
-    // WarpPerspective op is used to execute the affine transformation, as affine transformation is a subset.
-    WarpPerspective m_op;
 };
 }  // namespace roccv
\ No newline at end of file
diff --git a/src/op_warp_affine.cpp b/src/op_warp_affine.cpp
index a7346507..ee29111f 100644
--- a/src/op_warp_affine.cpp
+++ b/src/op_warp_affine.cpp
@@ -21,25 +21,142 @@ THE SOFTWARE.
 */
 #include "op_warp_affine.hpp"
 
-#include <hip/hip_runtime.h>
+#include <functional>
+
+#include "common/array_wrapper.hpp"
+#include "common/validation_helpers.hpp"
+#include "core/detail/casting.hpp"
+#include "core/detail/math/math.hpp"
+#include "kernels/device/warp_affine_device.hpp"
+#include "kernels/host/warp_affine_host.hpp"
 
 namespace roccv {
+template <typename T, eBorderType B, eInterpolationType I>
+void dispatch_warp_affine_interp(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                 const AffineTransform affineInv, T borderValue, eDeviceType device) {
+    ArrayWrapper<float, 6> transform(affineInv);
+    ImageWrapper<T> outputWrapper(output);
+    InterpolationWrapper<T, B, I> inputWrapper(input, borderValue);
+
+    switch (device) {
+        case eDeviceType::GPU: {
+            dim3 block(64, 16);
+            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
+                      outputWrapper.batches());
+            Kernels::Device::warp_affine<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);
+            break;
+        }
+
+        case eDeviceType::CPU: {
+            Kernels::Host::warp_affine(inputWrapper, outputWrapper, transform);
+            break;
+        }
+    }
+}
+
+template <typename T, eBorderType B>
+void dispatch_warp_affine_border_mode(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                      const AffineTransform affineInv, eInterpolationType interpolation, T borderValue,
+                                      eDeviceType device) {
+    // clang-format off
+    static const std::unordered_map<eInterpolationType, std::function<void(hipStream_t stream, const Tensor &, const Tensor &, const AffineTransform, T, eDeviceType)>>
+        funcs = {
+            {eInterpolationType::INTERP_TYPE_NEAREST, dispatch_warp_affine_interp<T, B, eInterpolationType::INTERP_TYPE_NEAREST>},
+            {eInterpolationType::INTERP_TYPE_LINEAR,  dispatch_warp_affine_interp<T, B, eInterpolationType::INTERP_TYPE_LINEAR>},
+            {eInterpolationType::INTERP_TYPE_CUBIC,   dispatch_warp_affine_interp<T, B, eInterpolationType::INTERP_TYPE_CUBIC>}
+        };
+    // clang-format on
+
+    if (!funcs.contains(interpolation)) {
+        throw Exception("Operation does not support the given interpolation mode.", eStatusType::NOT_IMPLEMENTED);
+    }
+
+    auto func = funcs.at(interpolation);
+    func(stream, input, output, affineInv, borderValue, device);
+}
+
+template <typename T>
+void dispatch_warp_affine_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                const AffineTransform affineInv, eInterpolationType interpolation,
+                                eBorderType borderType, float4 borderValue, eDeviceType device) {
+    // clang-format off
+    static const std::unordered_map<eBorderType, std::function<void(hipStream_t, const Tensor&, const Tensor&, const AffineTransform, eInterpolationType, T, eDeviceType)>>
+        funcs = {
+            {eBorderType::BORDER_TYPE_CONSTANT,   dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_CONSTANT>},
+            {eBorderType::BORDER_TYPE_REPLICATE,  dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_REPLICATE>},
+            {eBorderType::BORDER_TYPE_REFLECT,    dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_REFLECT>},
+            {eBorderType::BORDER_TYPE_REFLECT101, dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_REFLECT101>},
+            {eBorderType::BORDER_TYPE_WRAP,       dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_WRAP>}
+        };
+    // clang-format on
+
+    if (!funcs.contains(borderType)) {
+        throw Exception("Operator does not support the given border mode.", eStatusType::NOT_IMPLEMENTED);
+    }
 
-void WarpAffine::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, const AffineTransform xform,
+    auto func = funcs.at(borderType);
+    func(stream, input, output, affineInv, interpolation, detail::SaturateCast<T>(borderValue), device);
+}
+
+void WarpAffine::operator()(hipStream_t stream, const Tensor &input, const Tensor &output, const AffineTransform xform,
                             bool isInverted, eInterpolationType interp, eBorderType borderMode, float4 borderValue,
                             eDeviceType device) const {
-    // An affine transformation is a subset of a perspective transform, so use the existing operator. All tensor
-    // validation is performed in the WarpPerspective operator as well.
+    CHECK_TENSOR_DEVICE(input, device);
+    CHECK_TENSOR_DATATYPES(input, DATA_TYPE_S8, DATA_TYPE_U8, DATA_TYPE_U16, DATA_TYPE_S16, DATA_TYPE_U32,
+                           DATA_TYPE_S32, DATA_TYPE_F32, DATA_TYPE_F64);
+    CHECK_TENSOR_LAYOUT(input, TENSOR_LAYOUT_HWC, TENSOR_LAYOUT_NHWC);
+    CHECK_TENSOR_CHANNELS(input, 1, 3, 4);
+
+    eDataType dtype = input.dtype().etype();
+    int64_t channels = input.shape(input.layout().channels_index());
+
+    CHECK_TENSOR_COMPARISON(input.device() == output.device());
+    CHECK_TENSOR_COMPARISON(output.shape(output.layout().channels_index()) == channels);
+    CHECK_TENSOR_COMPARISON(output.dtype() == input.dtype());
+    CHECK_TENSOR_COMPARISON(output.layout() == input.layout());
+    if (output.layout().batch_index() != -1) {
+        CHECK_TENSOR_COMPARISON(output.shape(output.layout().batch_index()) ==
+                                input.shape(input.layout().batch_index()));
+    }
+
+    PerspectiveTransform full{};
+#pragma unroll
+    for (int i = 0; i < 6; i++) {
+        full[i] = xform[i];
+    }
+    full[6] = 0.0f;
+    full[7] = 0.0f;
+    full[8] = 1.0f;
 
-    PerspectiveTransform transform;
+    detail::math::Matrix<float, 3, 3> mat;
+    mat.load(full);
+    if (!isInverted) {
+        detail::math::inv_inplace(mat);
+    }
+    mat.store(full);
+
+    AffineTransform affineInv{};
 #pragma unroll
     for (int i = 0; i < 6; i++) {
-        transform[i] = xform[i];
+        affineInv[i] = full[i];
     }
-    transform[6] = 0.0f;
-    transform[7] = 0.0f;
-    transform[8] = 1.0f;
 
-    m_op(stream, input, output, transform, isInverted, interp, borderMode, borderValue, device);
+    // clang-format off
+    static const std::unordered_map<eDataType, std::array<std::function<void(hipStream_t, const Tensor &, const Tensor &, const AffineTransform, eInterpolationType, eBorderType, float4, eDeviceType)>, 4>>
+        funcs = {
+            {eDataType::DATA_TYPE_U8,  {dispatch_warp_affine_dtype<uchar1>, 0, dispatch_warp_affine_dtype<uchar3>, dispatch_warp_affine_dtype<uchar4>}},
+            {eDataType::DATA_TYPE_S8,  {dispatch_warp_affine_dtype<char1>, 0, dispatch_warp_affine_dtype<char3>, dispatch_warp_affine_dtype<char4>}},
+            {eDataType::DATA_TYPE_U16,  {dispatch_warp_affine_dtype<ushort1>, 0, dispatch_warp_affine_dtype<ushort3>, dispatch_warp_affine_dtype<ushort4>}},
+            {eDataType::DATA_TYPE_S16,  {dispatch_warp_affine_dtype<short1>, 0, dispatch_warp_affine_dtype<short3>, dispatch_warp_affine_dtype<short4>}},
+            {eDataType::DATA_TYPE_U32,  {dispatch_warp_affine_dtype<uint1>, 0, dispatch_warp_affine_dtype<uint3>, dispatch_warp_affine_dtype<uint4>}},
+            {eDataType::DATA_TYPE_S32,  {dispatch_warp_affine_dtype<int1>, 0, dispatch_warp_affine_dtype<int3>, dispatch_warp_affine_dtype<int4>}},
+            {eDataType::DATA_TYPE_F32, {dispatch_warp_affine_dtype<float1>, 0, dispatch_warp_affine_dtype<float3>, dispatch_warp_affine_dtype<float4>}},
+            {eDataType::DATA_TYPE_F64, {dispatch_warp_affine_dtype<double1>, 0, dispatch_warp_affine_dtype<double3>, dispatch_warp_affine_dtype<double4>}}
+        };
+    // clang-format on
+
+    auto func = funcs.at(dtype)[channels - 1];
+    if (func == 0) throw Exception("Not mapped to a defined function.", eStatusType::INVALID_OPERATION);
+    func(stream, input, output, affineInv, interp, borderMode, borderValue, device);
 }
-}  // namespace roccv
\ No newline at end of file
+}  // namespace roccv

From 5af67d27220e4264a8dd72c3efd3e043462bff97 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 1 Apr 2026 16:02:50 -0400
Subject: [PATCH 3/9] Remove nearest bounds check

---
 include/core/wrappers/interpolation_wrapper.hpp | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp
index e56b1e59..144addf7 100644
--- a/include/core/wrappers/interpolation_wrapper.hpp
+++ b/include/core/wrappers/interpolation_wrapper.hpp
@@ -23,8 +23,8 @@
 
 #include "core/detail/casting.hpp"
 #include "core/detail/math/vectorized_type_math.hpp"
-#include "core/wrappers/border_wrapper.hpp"
 #include "core/detail/vector_utils.hpp"
+#include "core/wrappers/border_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
@@ -96,9 +96,6 @@ class InterpolationWrapper {
         if constexpr (I == eInterpolationType::INTERP_TYPE_NEAREST) {
             const int64_t rh = static_cast<int64_t>(lroundf(h));
             const int64_t rw = static_cast<int64_t>(lroundf(w));
-            if (rw >= 0 && rw < m_desc.width() && rh >= 0 && rh < m_desc.height()) {
-                return m_desc.at_inbounds(n, rh, rw, c);
-            }
             return m_desc.at(n, rh, rw, c);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_LINEAR) {
             // Bilinear interpolation implementation
@@ -154,21 +151,22 @@ class InterpolationWrapper {
             }
 
             WorkType sum = SetAll<WorkType>(0.0f);
-            const bool cubic_fast = int_x >= 1 && int_y >= 1 && (int_x + 2) < m_desc.width() && (int_y + 2) < m_desc.height();
+            const bool cubic_fast =
+                int_x >= 1 && int_y >= 1 && (int_x + 2) < m_desc.width() && (int_y + 2) < m_desc.height();
             k = 0;
             if (cubic_fast) {
                 for (int index_y = -1; index_y <= 2; index_y++) {
                     for (int index_x = -1; index_x <= 2; index_x++) {
-                        sum = sum + detail::RangeCast<WorkType>(
-                                          m_desc.at_inbounds(n, int_y + index_y, int_x + index_x, c)) *
+                        sum = sum +
+                              detail::RangeCast<WorkType>(m_desc.at_inbounds(n, int_y + index_y, int_x + index_x, c)) *
                                   wxy[k++];
                     }
                 }
             } else {
                 for (int index_y = -1; index_y <= 2; index_y++) {
                     for (int index_x = -1; index_x <= 2; index_x++) {
-                        sum = sum + detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, c)) *
-                                  wxy[k++];
+                        sum = sum +
+                              detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, c)) * wxy[k++];
                     }
                 }
             }

From b781467f7f45d57d0465fba15f8d429c77743dd1 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Wed, 15 Apr 2026 12:39:07 -0400
Subject: [PATCH 4/9] Per-axis bounds checking in border wrapper

---
 include/core/wrappers/border_wrapper.hpp | 30 +++++++++++++++---------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp
index 67342ec2..80942cc0 100644
--- a/include/core/wrappers/border_wrapper.hpp
+++ b/include/core/wrappers/border_wrapper.hpp
@@ -21,9 +21,10 @@
 
 #pragma once
 
-#include <cstdint>
 #include <hip/hip_runtime.h>
 
+#include <cstdint>
+
 #include "core/wrappers/image_wrapper.hpp"
 #include "operator_types.h"
 
@@ -127,13 +128,16 @@ class BorderWrapper {
         // Reflect border type implementation. (Note: This is NOT REFLECT101, pixels at the border will be duplicated as
         // is the intended behavior for this border mode.)
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) {
-            int64_t scale = imgWidth * 2;
-            int64_t val = detail::euclid_mod_i64_fast(w, scale);
-            x = (val < imgWidth) ? val : scale - 1 - val;
-
-            scale = imgHeight * 2;
-            val = detail::euclid_mod_i64_fast(h, scale);
-            y = (val < imgHeight) ? val : scale - 1 - val;
+            if (w < 0 || w >= imgWidth) {
+                int64_t scale = imgWidth * 2;
+                int64_t val = detail::euclid_mod_i64_fast(w, scale);
+                x = (val < imgWidth) ? val : scale - 1 - val;
+            }
+            if (h < 0 || h >= imgHeight) {
+                int64_t scale = imgHeight * 2;
+                int64_t val = detail::euclid_mod_i64_fast(h, scale);
+                y = (val < imgHeight) ? val : scale - 1 - val;
+            }
         }
 
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT101) {
@@ -154,10 +158,14 @@ class BorderWrapper {
             }
         }
 
-        // Replicate border type implementation
+        // Replicate: snap OOB axes to nearest edge; in-range axes stay x=w / y=h (see global early return).
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REPLICATE) {
-            x = std::clamp<int64_t>(w, 0, imgWidth - 1);
-            y = std::clamp<int64_t>(h, 0, imgHeight - 1);
+            if (w < 0 || w >= imgWidth) {
+                x = (w < 0) ? 0 : imgWidth - 1;
+            }
+            if (h < 0 || h >= imgHeight) {
+                y = (h < 0) ? 0 : imgHeight - 1;
+            }
         }
 
         // Wrap border type implementation

From 56f0831c682facc49bedb61565a8fd01e117203f Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Thu, 16 Apr 2026 10:51:56 -0400
Subject: [PATCH 5/9] Unroll for loops in cubic interpolation implementation

---
 include/core/wrappers/border_wrapper.hpp        | 9 ++-------
 include/core/wrappers/interpolation_wrapper.hpp | 6 ++++++
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp
index 80942cc0..b99dccce 100644
--- a/include/core/wrappers/border_wrapper.hpp
+++ b/include/core/wrappers/border_wrapper.hpp
@@ -170,13 +170,8 @@ class BorderWrapper {
 
         // Wrap border type implementation
         if constexpr (BorderType == eBorderType::BORDER_TYPE_WRAP) {
-            if (w < 0 || w >= imgWidth) {
-                x = detail::euclid_mod_i64_fast(w, imgWidth);
-            }
-
-            if (h < 0 || h >= imgHeight) {
-                y = detail::euclid_mod_i64_fast(h, imgHeight);
-            }
+            x = detail::euclid_mod_i64_fast(w, imgWidth);
+            y = detail::euclid_mod_i64_fast(h, imgHeight);
         }
 
         return m_desc.at(n, y, x, c);
diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp
index 144addf7..a3d6e9e7 100644
--- a/include/core/wrappers/interpolation_wrapper.hpp
+++ b/include/core/wrappers/interpolation_wrapper.hpp
@@ -144,7 +144,9 @@ class InterpolationWrapper {
 
             float wxy[16];
             int k = 0;
+#pragma unroll
             for (int j = 0; j < 4; j++) {
+#pragma unroll
                 for (int i = 0; i < 4; i++) {
                     wxy[k++] = weight_y[j] * weight_x[i];
                 }
@@ -155,7 +157,9 @@ class InterpolationWrapper {
                 int_x >= 1 && int_y >= 1 && (int_x + 2) < m_desc.width() && (int_y + 2) < m_desc.height();
             k = 0;
             if (cubic_fast) {
+#pragma unroll
                 for (int index_y = -1; index_y <= 2; index_y++) {
+#pragma unroll
                     for (int index_x = -1; index_x <= 2; index_x++) {
                         sum = sum +
                               detail::RangeCast<WorkType>(m_desc.at_inbounds(n, int_y + index_y, int_x + index_x, c)) *
@@ -163,7 +167,9 @@ class InterpolationWrapper {
                     }
                 }
             } else {
+#pragma unroll
                 for (int index_y = -1; index_y <= 2; index_y++) {
+#pragma unroll
                     for (int index_x = -1; index_x <= 2; index_x++) {
                         sum = sum +
                               detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, c)) * wxy[k++];

From 852176a1c27f634455850f98bc32fd090803ad42 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Thu, 16 Apr 2026 11:24:14 -0400
Subject: [PATCH 6/9] General optimizations for border/interpolation wrapper
 implementations

---
 include/core/wrappers/border_wrapper.hpp      | 110 +++++++++++++-----
 .../core/wrappers/interpolation_wrapper.hpp   |  76 +++++++-----
 2 files changed, 126 insertions(+), 60 deletions(-)

diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp
index b99dccce..538d0cec 100644
--- a/include/core/wrappers/border_wrapper.hpp
+++ b/include/core/wrappers/border_wrapper.hpp
@@ -31,6 +31,34 @@
 namespace roccv {
 namespace detail {
 
+/** Branchless absolute value for int64 (two's complement); avoids libm/std::abs on GPU. */
+__device__ __host__ __forceinline__ int64_t abs_i64(int64_t v) {
+    const int64_t mask = v >> 63;
+    return (v ^ mask) - mask;
+}
+
+__device__ __host__ __forceinline__ int32_t abs_i32(int32_t v) {
+    const int32_t mask = v >> 31;
+    return (v ^ mask) - mask;
+}
+
+__device__ __host__ __forceinline__ int32_t min_i32(int32_t a, int32_t b) { return a < b ? a : b; }
+
+__device__ __host__ __forceinline__ int64_t min_i64(int64_t a, int64_t b) { return a < b ? a : b; }
+
+__device__ __host__ __forceinline__ int64_t max_i64(int64_t a, int64_t b) { return a > b ? a : b; }
+
+/** Clamp v to [lo, hi]. */
+__device__ __host__ __forceinline__ int64_t clamp_i64(int64_t v, int64_t lo, int64_t hi) {
+    return min_i64(max_i64(v, lo), hi);
+}
+
+__device__ __host__ inline int32_t euclid_mod_i32(int32_t a, int32_t modulus) {
+    int32_t r = a % modulus;
+    if (r < 0) r += modulus;
+    return r;
+}
+
 /** Euclidean modulo: result in [0, modulus) for modulus > 0. One hardware remainder vs (a%m+m)%m. */
 __device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) {
     int64_t r = a % modulus;
@@ -56,6 +84,53 @@ __device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulu
     return euclid_mod_i64(a, modulus);
 }
 
+/**
+ * OpenCV-style BORDER_REFLECT axis map: period 2*extent, edge pixels duplicated (not REFLECT101).
+ * Equivalent to: val = euclid_mod(coord, 2*extent); min(val, 2*extent - 1 - val).
+ * The min form avoids a branch on val < extent and fuses well with 32-bit mod on GPU.
+ */
+__device__ __host__ inline int64_t reflect_border_coord_i64(int64_t coord, int64_t extent) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    constexpr int64_t kLim = int64_t{1} << 30;
+    if (extent > 0 && extent < kLim && coord > -kLim && coord < kLim) {
+        const int32_t e = static_cast<int32_t>(extent);
+        const int32_t scale = e * 2;
+        int32_t val = static_cast<int32_t>(coord) % scale;
+        if (val < 0) val += scale;
+        const int32_t inv = scale - 1 - val;
+        return static_cast<int64_t>(min_i32(val, inv));
+    }
+#endif
+    const int64_t scale = extent * 2;
+    const int64_t val = euclid_mod_i64(coord, scale);
+    const int64_t inv = scale - 1 - val;
+    return min_i64(val, inv);
+}
+
+/**
+ * BORDER_REFLECT101 axis map: period (2*extent - 2), endpoints excluded from reflection.
+ */
+__device__ __host__ inline int64_t reflect101_border_coord_i64(int64_t coord, int64_t extent) {
+    if (extent <= 1) {
+        return 0;
+    }
+    const int64_t scale = 2 * extent - 2;
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    constexpr int64_t kLim = int64_t{1} << 30;
+    if (extent < kLim && coord > -kLim && coord < kLim && scale > 0 && scale < kLim) {
+        const int32_t e = static_cast<int32_t>(extent);
+        const int32_t s = e * 2 - 2;
+        int32_t v = euclid_mod_i32(static_cast<int32_t>(coord), s);
+        const int32_t inner = (e - 1) - v;
+        const int32_t a = abs_i32(inner);
+        return static_cast<int64_t>((e - 1) - a);
+    }
+#endif
+    const int64_t v = euclid_mod_i64_fast(coord, scale);
+    const int64_t inner = (extent - 1) - v;
+    return (extent - 1) - abs_i64(inner);
+}
+
 }  // namespace detail
 
 /**
@@ -129,43 +204,22 @@ class BorderWrapper {
         // is the intended behavior for this border mode.)
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) {
             if (w < 0 || w >= imgWidth) {
-                int64_t scale = imgWidth * 2;
-                int64_t val = detail::euclid_mod_i64_fast(w, scale);
-                x = (val < imgWidth) ? val : scale - 1 - val;
+                x = detail::reflect_border_coord_i64(w, imgWidth);
             }
             if (h < 0 || h >= imgHeight) {
-                int64_t scale = imgHeight * 2;
-                int64_t val = detail::euclid_mod_i64_fast(h, scale);
-                y = (val < imgHeight) ? val : scale - 1 - val;
+                y = detail::reflect_border_coord_i64(h, imgHeight);
             }
         }
 
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT101) {
-            if (imgWidth == 1) {
-                x = 0;
-            } else {
-                int64_t scale = 2 * imgWidth - 2;
-                x = detail::euclid_mod_i64(w, scale);
-                x = imgWidth - 1 - std::abs(imgWidth - 1 - x);
-            }
-
-            if (imgHeight == 1) {
-                y = 0;
-            } else {
-                int64_t scale = 2 * imgHeight - 2;
-                y = detail::euclid_mod_i64(h, scale);
-                y = imgHeight - 1 - std::abs(imgHeight - 1 - y);
-            }
+            x = detail::reflect101_border_coord_i64(w, imgWidth);
+            y = detail::reflect101_border_coord_i64(h, imgHeight);
         }
 
-        // Replicate: snap OOB axes to nearest edge; in-range axes stay x=w / y=h (see global early return).
+        // Replicate: clamp to edge. Equivalent to per-axis OOB snap; min/max maps cleanly to GPU integer ops.
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REPLICATE) {
-            if (w < 0 || w >= imgWidth) {
-                x = (w < 0) ? 0 : imgWidth - 1;
-            }
-            if (h < 0 || h >= imgHeight) {
-                y = (h < 0) ? 0 : imgHeight - 1;
-            }
+            x = detail::clamp_i64(w, 0, imgWidth - 1);
+            y = detail::clamp_i64(h, 0, imgHeight - 1);
         }
 
         // Wrap border type implementation
diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp
index a3d6e9e7..609f956a 100644
--- a/include/core/wrappers/interpolation_wrapper.hpp
+++ b/include/core/wrappers/interpolation_wrapper.hpp
@@ -21,13 +21,33 @@
 
 #pragma once
 
+#include <cmath>
+
+#include <hip/hip_runtime.h>
+
 #include "core/detail/casting.hpp"
-#include "core/detail/math/vectorized_type_math.hpp"
 #include "core/detail/vector_utils.hpp"
 #include "core/wrappers/border_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
+namespace detail {
+
+/** Floor to int64; on device uses elementwise floor intrinsic (matches HIP __float2ll_rd lowering). */
+__device__ __host__ __forceinline__ int64_t interp_floor_i64(float x) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    return static_cast<int64_t>(static_cast<long long>(__builtin_elementwise_floor(x)));
+#else
+    return static_cast<int64_t>(floorf(x));
+#endif
+}
+
+/** Nearest index; llroundf matches lroundf for float (half away from zero). */
+__device__ __host__ __forceinline__ int64_t interp_nearest_i64(float x) {
+    return static_cast<int64_t>(std::llroundf(x));
+}
+
+}  // namespace detail
 
 /**
  * @brief A kernel-friendly wrapper which provides interpolation logic based on the given
@@ -66,22 +86,12 @@ class InterpolationWrapper {
      * @return None.
      */
     __device__ __host__ inline void CalBicubicWeights(float dist, float* weight) const {
-        weight[0] = -0.5f;
-        weight[0] = weight[0] * dist + 1.0f;
-        weight[0] = weight[0] * dist - 0.5f;
-        weight[0] = weight[0] * dist;
-
-        weight[1] = 1.5f;
-        weight[1] = weight[1] * dist - 2.5f;
-        weight[1] = weight[1] * dist;
-        weight[1] = weight[1] * dist + 1.0f;
-
-        weight[2] = -1.5f;
-        weight[2] = weight[2] * dist + 2.f;
-        weight[2] = weight[2] * dist + 0.5f;
-        weight[2] = weight[2] * dist;
-
-        weight[3] = 1 - weight[0] - weight[1] - weight[2];
+        const float d = dist;
+        // Fused multiply-add: single rounding vs separate mul+add (matches kernels/device style).
+        weight[0] = fmaf(fmaf(fmaf(-0.5f, d, 1.0f), d, -0.5f), d, 0.f);
+        weight[1] = fmaf(fmaf(fmaf(1.5f, d, -2.5f), d, 0.f), d, 1.0f);
+        weight[2] = fmaf(fmaf(fmaf(-1.5f, d, 2.f), d, 0.5f), d, 0.f);
+        weight[3] = 1.f - weight[0] - weight[1] - weight[2];
     }
 
     /**
@@ -94,9 +104,7 @@ class InterpolationWrapper {
      */
     inline __device__ __host__ const T at(int64_t n, float h, float w, int64_t c) const {
         if constexpr (I == eInterpolationType::INTERP_TYPE_NEAREST) {
-            const int64_t rh = static_cast<int64_t>(lroundf(h));
-            const int64_t rw = static_cast<int64_t>(lroundf(w));
-            return m_desc.at(n, rh, rw, c);
+            return m_desc.at(n, detail::interp_nearest_i64(h), detail::interp_nearest_i64(w), c);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_LINEAR) {
             // Bilinear interpolation implementation
             // v1 -- v2
@@ -105,19 +113,23 @@ class InterpolationWrapper {
 
             using WorkType = detail::MakeType<float, detail::NumElements<T>>;
 
-            int64_t x0 = static_cast<int64_t>(floorf(w));
-            int64_t x1 = x0 + 1;
-            int64_t y0 = static_cast<int64_t>(floorf(h));
-            int64_t y1 = y0 + 1;
+            const int64_t x0 = detail::interp_floor_i64(w);
+            const int64_t y0 = detail::interp_floor_i64(h);
+            const int64_t x1 = x0 + 1;
+            const int64_t y1 = y0 + 1;
+            const float fx = w - static_cast<float>(x0);
+            const float fy = h - static_cast<float>(y0);
+            const float omfx = 1.f - fx;
+            const float omfy = 1.f - fy;
 
             if (x0 >= 0 && y0 >= 0 && x1 < m_desc.width() && y1 < m_desc.height()) {
                 auto v1 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y0, x0, c));
                 auto v2 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y0, x1, c));
                 auto v3 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y1, x0, c));
                 auto v4 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y1, x1, c));
-                auto q1 = v1 * static_cast<float>(x1 - w) + v2 * static_cast<float>(w - x0);
-                auto q2 = v3 * static_cast<float>(x1 - w) + v4 * static_cast<float>(w - x0);
-                auto q = q1 * static_cast<float>(y1 - h) + q2 * static_cast<float>(h - y0);
+                auto q1 = v1 * omfx + v2 * fx;
+                auto q2 = v3 * omfx + v4 * fx;
+                auto q = q1 * omfy + q2 * fy;
                 return detail::RangeCast<T>(q);
             }
 
@@ -126,17 +138,17 @@ class InterpolationWrapper {
             auto v3 = detail::RangeCast<WorkType>(m_desc.at(n, y1, x0, c));
             auto v4 = detail::RangeCast<WorkType>(m_desc.at(n, y1, x1, c));
 
-            auto q1 = v1 * static_cast<float>(x1 - w) + v2 * static_cast<float>(w - x0);
-            auto q2 = v3 * static_cast<float>(x1 - w) + v4 * static_cast<float>(w - x0);
-            auto q = q1 * static_cast<float>(y1 - h) + q2 * static_cast<float>(h - y0);
+            auto q1 = v1 * omfx + v2 * fx;
+            auto q2 = v3 * omfx + v4 * fx;
+            auto q = q1 * omfy + q2 * fy;
 
             return detail::RangeCast<T>(q);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_CUBIC) {
             using namespace roccv::detail;
             using WorkType = detail::MakeType<float, detail::NumElements<T>>;
 
-            const int64_t int_x = static_cast<int64_t>(floorf(w));
-            const int64_t int_y = static_cast<int64_t>(floorf(h));
+            const int64_t int_x = detail::interp_floor_i64(w);
+            const int64_t int_y = detail::interp_floor_i64(h);
 
             float weight_x[4], weight_y[4];
             CalBicubicWeights(w - static_cast<float>(int_x), weight_x);

From 2beed7a87f441de98e41826f5c60596100b60c61 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Thu, 16 Apr 2026 11:42:10 -0400
Subject: [PATCH 7/9] Change warp perspective block size

---
 src/op_warp_perspective.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp
index ca77fc8c..33717694 100644
--- a/src/op_warp_perspective.cpp
+++ b/src/op_warp_perspective.cpp
@@ -42,7 +42,7 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(64, 16);
+            dim3 block(32, 8);
             dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
                       outputWrapper.batches());
             Kernels::Device::warp_perspective<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);

From 79d84a1e5d0532c37a408ad6e577f6d7004383ec Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Thu, 16 Apr 2026 11:45:27 -0400
Subject: [PATCH 8/9] Revert block size changes

---
 src/op_warp_perspective.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp
index 33717694..ca77fc8c 100644
--- a/src/op_warp_perspective.cpp
+++ b/src/op_warp_perspective.cpp
@@ -42,7 +42,7 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c
     // Launch CPU/GPU kernel depending on requested device type.
     switch (device) {
         case eDeviceType::GPU: {
-            dim3 block(32, 8);
+            dim3 block(64, 16);
             dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
                       outputWrapper.batches());
             Kernels::Device::warp_perspective<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);

From 99ac06789ec47d8eec5a52cd89ff46eea3f69bd7 Mon Sep 17 00:00:00 2001
From: Zach Vincze <zavincze@amd.com>
Date: Thu, 16 Apr 2026 12:02:35 -0400
Subject: [PATCH 9/9] Organize helpers/improve docstrings

---
 include/core/detail/sampling_helpers.hpp      | 164 ++++++++++++++++++
 include/core/wrappers/border_wrapper.hpp      |  81 ++-------
 .../core/wrappers/interpolation_wrapper.hpp   |  23 +--
 3 files changed, 182 insertions(+), 86 deletions(-)
 create mode 100644 include/core/detail/sampling_helpers.hpp

diff --git a/include/core/detail/sampling_helpers.hpp b/include/core/detail/sampling_helpers.hpp
new file mode 100644
index 00000000..812868f8
--- /dev/null
+++ b/include/core/detail/sampling_helpers.hpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma once
+
+/**
+ * @file sampling_helpers.hpp
+ * @brief Small host/device helpers for border coordinate math and interpolation index conversion.
+ */
+
+#include <hip/hip_runtime.h>
+
+#include <cmath>
+#include <cstdint>
+
+namespace roccv {
+namespace detail {
+
+/**
+ * @brief Branchless absolute value of a signed 64-bit integer (two's complement).
+ * @param v Input value.
+ * @return Non-negative absolute value of @p v.
+ * @note Avoids @c std::abs in device code; useful on GPU hot paths.
+ */
+__device__ __host__ __forceinline__ int64_t abs_i64(int64_t v) {
+    const int64_t mask = v >> 63;
+    return (v ^ mask) - mask;
+}
+
+/**
+ * @brief Branchless absolute value of a signed 32-bit integer (two's complement).
+ * @param v Input value.
+ * @return Non-negative absolute value of @p v.
+ */
+__device__ __host__ __forceinline__ int32_t abs_i32(int32_t v) {
+    const int32_t mask = v >> 31;
+    return (v ^ mask) - mask;
+}
+
+/**
+ * @brief Minimum of two 32-bit signed integers.
+ * @param a First operand.
+ * @param b Second operand.
+ * @return The lesser of @p a and @p b.
+ */
+__device__ __host__ __forceinline__ int32_t min_i32(int32_t a, int32_t b) { return a < b ? a : b; }
+
+/**
+ * @brief Minimum of two 64-bit signed integers.
+ * @param a First operand.
+ * @param b Second operand.
+ * @return The lesser of @p a and @p b.
+ */
+__device__ __host__ __forceinline__ int64_t min_i64(int64_t a, int64_t b) { return a < b ? a : b; }
+
+/**
+ * @brief Maximum of two 64-bit signed integers.
+ * @param a First operand.
+ * @param b Second operand.
+ * @return The greater of @p a and @p b.
+ */
+__device__ __host__ __forceinline__ int64_t max_i64(int64_t a, int64_t b) { return a > b ? a : b; }
+
+/**
+ * @brief Clamp a signed 64-bit integer to a closed interval.
+ * @param v Value to clamp.
+ * @param lo Lower bound (inclusive).
+ * @param hi Upper bound (inclusive); must satisfy @p lo <= @p hi.
+ * @return @p v restricted to the inclusive interval between @p lo and @p hi.
+ */
+__device__ __host__ __forceinline__ int64_t clamp_i64(int64_t v, int64_t lo, int64_t hi) {
+    return min_i64(max_i64(v, lo), hi);
+}
+
+/**
+ * @brief Euclidean (non-negative) modulo for 32-bit operands.
+ * @param a Dividend.
+ * @param modulus Strictly positive modulus.
+ * @return Remainder in the half-open range <tt>[0, modulus)</tt>, congruent to @p a modulo @p modulus.
+ */
+__device__ __host__ inline int32_t euclid_mod_i32(int32_t a, int32_t modulus) {
+    int32_t r = a % modulus;
+    if (r < 0) r += modulus;
+    return r;
+}
+
+/**
+ * @brief Euclidean (non-negative) modulo for 64-bit operands.
+ * @param a Dividend.
+ * @param modulus Strictly positive modulus.
+ * @return Remainder in the half-open range <tt>[0, modulus)</tt>, congruent to @p a modulo @p modulus.
+ * @note Uses a single remainder and correction instead of repeating modulo and add.
+ */
+__device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) {
+    int64_t r = a % modulus;
+    if (r < 0) r += modulus;
+    return r;
+}
+
+/**
+ * @brief Euclidean modulo with a 32-bit fast path on the GPU when operands are in a safe range.
+ * @param a Dividend.
+ * @param modulus Strictly positive modulus.
+ * @return Same as euclid_mod_i64() when operands fit the device fast path; otherwise defers to euclid_mod_i64().
+ * @note On device, uses 32-bit remainder when @p modulus and @p a are sufficiently small to avoid 64-bit division.
+ *       On host, always uses euclid_mod_i64(). Caller must keep values in range when relying on the fast path.
+ */
+__device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulus) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    constexpr int64_t kLim = int64_t{1} << 30;
+    if (modulus > 0 && modulus < kLim && a > -kLim && a < kLim) {
+        int32_t ai = static_cast<int32_t>(a);
+        int32_t m = static_cast<int32_t>(modulus);
+        int32_t r = ai % m;
+        if (r < 0) r += m;
+        return static_cast<int64_t>(r);
+    }
+#endif
+    return euclid_mod_i64(a, modulus);
+}
+
+/**
+ * @brief Convert a subpixel coordinate to the integer grid index below @p x (floor).
+ * @param x Source coordinate in pixels.
+ * @return Largest int64 not greater than @p x (i.e. floor), suitable as the left/top neighbor index for bilinear/cubic.
+ * @note On device, uses a floor intrinsic compatible with HIP @c __float2ll_rd lowering; on host uses @c floorf().
+ */
+__device__ __host__ __forceinline__ int64_t interp_floor_i64(float x) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    return static_cast<int64_t>(static_cast<long long>(__builtin_elementwise_floor(x)));
+#else
+    return static_cast<int64_t>(floorf(x));
+#endif
+}
+
+/**
+ * @brief Nearest-neighbor rounding of a subpixel coordinate to an integer index.
+ * @param x Source coordinate in pixels.
+ * @return Integer closest to @p x, with half values rounded away from zero (same convention as @c std::llroundf()).
+ */
+__device__ __host__ __forceinline__ int64_t interp_nearest_i64(float x) {
+    return static_cast<int64_t>(std::llroundf(x));
+}
+
+}  // namespace detail
+}  // namespace roccv
diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp
index 538d0cec..6289590b 100644
--- a/include/core/wrappers/border_wrapper.hpp
+++ b/include/core/wrappers/border_wrapper.hpp
@@ -21,73 +21,22 @@
 
 #pragma once
 
-#include <hip/hip_runtime.h>
-
-#include <cstdint>
-
+#include "core/detail/sampling_helpers.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
-namespace detail {
-
-/** Branchless absolute value for int64 (two's complement); avoids libm/std::abs on GPU. */
-__device__ __host__ __forceinline__ int64_t abs_i64(int64_t v) {
-    const int64_t mask = v >> 63;
-    return (v ^ mask) - mask;
-}
-
-__device__ __host__ __forceinline__ int32_t abs_i32(int32_t v) {
-    const int32_t mask = v >> 31;
-    return (v ^ mask) - mask;
-}
-
-__device__ __host__ __forceinline__ int32_t min_i32(int32_t a, int32_t b) { return a < b ? a : b; }
-
-__device__ __host__ __forceinline__ int64_t min_i64(int64_t a, int64_t b) { return a < b ? a : b; }
-
-__device__ __host__ __forceinline__ int64_t max_i64(int64_t a, int64_t b) { return a > b ? a : b; }
-
-/** Clamp v to [lo, hi]. */
-__device__ __host__ __forceinline__ int64_t clamp_i64(int64_t v, int64_t lo, int64_t hi) {
-    return min_i64(max_i64(v, lo), hi);
-}
-
-__device__ __host__ inline int32_t euclid_mod_i32(int32_t a, int32_t modulus) {
-    int32_t r = a % modulus;
-    if (r < 0) r += modulus;
-    return r;
-}
-
-/** Euclidean modulo: result in [0, modulus) for modulus > 0. One hardware remainder vs (a%m+m)%m. */
-__device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) {
-    int64_t r = a % modulus;
-    if (r < 0) r += modulus;
-    return r;
-}
-
-/**
- * On GPU, use 32-bit remainder when operands fit; avoids 64-bit integer division in REFLECT/WRAP hot paths.
- * Host always uses euclid_mod_i64. Values must stay in range for correctness.
- */
-__device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulus) {
-#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
-    constexpr int64_t kLim = int64_t{1} << 30;
-    if (modulus > 0 && modulus < kLim && a > -kLim && a < kLim) {
-        int32_t ai = static_cast<int32_t>(a);
-        int32_t m = static_cast<int32_t>(modulus);
-        int32_t r = ai % m;
-        if (r < 0) r += m;
-        return static_cast<int64_t>(r);
-    }
-#endif
-    return euclid_mod_i64(a, modulus);
-}
 
+namespace detail {
 /**
- * OpenCV-style BORDER_REFLECT axis map: period 2*extent, edge pixels duplicated (not REFLECT101).
- * Equivalent to: val = euclid_mod(coord, 2*extent); min(val, 2*extent - 1 - val).
- * The min form avoids a branch on val < extent and fuses well with 32-bit mod on GPU.
+ * @brief Map one axis coordinate for OpenCV-style @c BORDER_REFLECT (edge pixels duplicated; not @c BORDER_REFLECT101).
+ * @param coord Possibly out-of-bounds coordinate along the axis (width or height index space).
+ * @param extent Positive extent of the axis (number of samples, e.g. image width or height).
+ * @return In-bounds index in <tt>[0, extent)</tt> after reflection.
+ * @note Period is <tt>2 * extent</tt>. Implementation uses Euclidean modulo then
+ *       <tt>min(val, 2*extent - 1 - val)</tt>, which matches comparing @c val to @c extent with a ternary, without a
+ *       separate branch on @p extent alone. On device, a 32-bit remainder path is used when @p extent and @p coord are
+ *       in a safe range.
  */
 __device__ __host__ inline int64_t reflect_border_coord_i64(int64_t coord, int64_t extent) {
 #if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
@@ -108,7 +57,12 @@ __device__ __host__ inline int64_t reflect_border_coord_i64(int64_t coord, int64
 }
 
 /**
- * BORDER_REFLECT101 axis map: period (2*extent - 2), endpoints excluded from reflection.
+ * @brief Map one axis coordinate for OpenCV-style @c BORDER_REFLECT101 (endpoints are not repeated in the reflection).
+ * @param coord Possibly out-of-bounds coordinate along the axis.
+ * @param extent Positive extent of the axis (number of samples). If @p extent is at most 1, returns @c 0.
+ * @return In-bounds index in <tt>[0, extent)</tt> after reflection.
+ * @note Period is <tt>2 * extent - 2</tt> when @p extent is greater than 1. Uses Euclidean modulo then folds with
+ *       <tt>(extent - 1) - abs((extent - 1) - v)</tt>. On device, a 32-bit path applies when values fit a fixed bound.
  */
 __device__ __host__ inline int64_t reflect101_border_coord_i64(int64_t coord, int64_t extent) {
     if (extent <= 1) {
@@ -130,7 +84,6 @@ __device__ __host__ inline int64_t reflect101_border_coord_i64(int64_t coord, in
     const int64_t inner = (extent - 1) - v;
     return (extent - 1) - abs_i64(inner);
 }
-
 }  // namespace detail
 
 /**
@@ -253,7 +206,7 @@ class BorderWrapper {
     __device__ __host__ inline int64_t batches() const { return m_desc.batches(); }
 
     /**
-     * @brief Retries the number of channels in the image.
+     * @brief Retrieves the number of channels in the image.
      *
      * @return Image channels.
      */
diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp
index 609f956a..48fed6c3 100644
--- a/include/core/wrappers/interpolation_wrapper.hpp
+++ b/include/core/wrappers/interpolation_wrapper.hpp
@@ -21,34 +21,13 @@
 
 #pragma once
 
-#include <cmath>
-
-#include <hip/hip_runtime.h>
-
 #include "core/detail/casting.hpp"
+#include "core/detail/sampling_helpers.hpp"
 #include "core/detail/vector_utils.hpp"
 #include "core/wrappers/border_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
-namespace detail {
-
-/** Floor to int64; on device uses elementwise floor intrinsic (matches HIP __float2ll_rd lowering). */
-__device__ __host__ __forceinline__ int64_t interp_floor_i64(float x) {
-#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
-    return static_cast<int64_t>(static_cast<long long>(__builtin_elementwise_floor(x)));
-#else
-    return static_cast<int64_t>(floorf(x));
-#endif
-}
-
-/** Nearest index; llroundf matches lroundf for float (half away from zero). */
-__device__ __host__ __forceinline__ int64_t interp_nearest_i64(float x) {
-    return static_cast<int64_t>(std::llroundf(x));
-}
-
-}  // namespace detail
-
 /**
  * @brief A kernel-friendly wrapper which provides interpolation logic based on the given
  * coordinates. This tensor wrapper is typically only used for input tensors and does not provide write access to its