diff --git a/include/core/detail/sampling_helpers.hpp b/include/core/detail/sampling_helpers.hpp
new file mode 100644
index 00000000..812868f8
--- /dev/null
+++ b/include/core/detail/sampling_helpers.hpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma once
+
+/**
+ * @file sampling_helpers.hpp
+ * @brief Small host/device helpers for border coordinate math and interpolation index conversion.
+ */
+
+#include <hip/hip_runtime.h>
+
+#include <cmath>
+#include <cstdint>
+
+namespace roccv {
+namespace detail {
+
+/**
+ * @brief Branchless absolute value of a signed 64-bit integer (two's complement).
+ * @param v Input value.
+ * @return Non-negative absolute value of @p v.
+ * @note Avoids @c std::abs in device code; useful on GPU hot paths.
+ */
+__device__ __host__ __forceinline__ int64_t abs_i64(int64_t v) {
+    const int64_t mask = v >> 63;
+    return (v ^ mask) - mask;
+}
+
+/**
+ * @brief Branchless absolute value of a signed 32-bit integer (two's complement).
+ * @param v Input value.
+ * @return Non-negative absolute value of @p v.
+ */
+__device__ __host__ __forceinline__ int32_t abs_i32(int32_t v) {
+    const int32_t mask = v >> 31;
+    return (v ^ mask) - mask;
+}
+
+/**
+ * @brief Minimum of two 32-bit signed integers.
+ * @param a First operand.
+ * @param b Second operand.
+ * @return The lesser of @p a and @p b.
+ */
+__device__ __host__ __forceinline__ int32_t min_i32(int32_t a, int32_t b) { return a < b ? a : b; }
+
+/**
+ * @brief Minimum of two 64-bit signed integers.
+ * @param a First operand.
+ * @param b Second operand.
+ * @return The lesser of @p a and @p b.
+ */
+__device__ __host__ __forceinline__ int64_t min_i64(int64_t a, int64_t b) { return a < b ? a : b; }
+
+/**
+ * @brief Maximum of two 64-bit signed integers.
+ * @param a First operand.
+ * @param b Second operand.
+ * @return The greater of @p a and @p b.
+ */
+__device__ __host__ __forceinline__ int64_t max_i64(int64_t a, int64_t b) { return a > b ? a : b; }
+
+/**
+ * @brief Clamp a signed 64-bit integer to a closed interval.
+ * @param v Value to clamp.
+ * @param lo Lower bound (inclusive).
+ * @param hi Upper bound (inclusive); must satisfy @p lo <= @p hi.
+ * @return @p v restricted to the inclusive interval between @p lo and @p hi.
+ */
+__device__ __host__ __forceinline__ int64_t clamp_i64(int64_t v, int64_t lo, int64_t hi) {
+    return min_i64(max_i64(v, lo), hi);
+}
+
+/**
+ * @brief Euclidean (non-negative) modulo for 32-bit operands.
+ * @param a Dividend.
+ * @param modulus Strictly positive modulus.
+ * @return Remainder in the half-open range [0, modulus), congruent to @p a modulo @p modulus.
+ */
+__device__ __host__ inline int32_t euclid_mod_i32(int32_t a, int32_t modulus) {
+    int32_t r = a % modulus;
+    if (r < 0) r += modulus;
+    return r;
+}
+
+/**
+ * @brief Euclidean (non-negative) modulo for 64-bit operands.
+ * @param a Dividend.
+ * @param modulus Strictly positive modulus.
+ * @return Remainder in the half-open range [0, modulus), congruent to @p a modulo @p modulus.
+ * @note Uses a single remainder plus a conditional correction instead of the double-modulo idiom
+ *       ((a % m + m) % m).
+ */
+__device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) {
+    int64_t r = a % modulus;
+    if (r < 0) r += modulus;
+    return r;
+}
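+
+// Worked example (sanity check, not part of the API): C++'s % truncates toward
+// zero, so -3 % 5 == -3, whereas euclid_mod_i32(-3, 5) == 2 and
+// euclid_mod_i32(7, 5) == 2. Both results land in [0, 5), which is what the
+// border-wrapping code below relies on.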
+
+/**
+ * @brief Euclidean modulo with a 32-bit fast path on the GPU when operands are in a safe range.
+ * @param a Dividend.
+ * @param modulus Strictly positive modulus.
+ * @return Same as euclid_mod_i64() when operands fit the device fast path; otherwise defers to euclid_mod_i64().
+ * @note On device, uses a 32-bit remainder when @p modulus and @p a are sufficiently small to avoid 64-bit
+ *       division. On host, always uses euclid_mod_i64(). The range check is performed internally; out-of-range
+ *       operands simply fall back to the 64-bit path.
+ */
+__device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulus) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    constexpr int64_t kLim = int64_t{1} << 30;
+    if (modulus > 0 && modulus < kLim && a > -kLim && a < kLim) {
+        int32_t ai = static_cast<int32_t>(a);
+        int32_t m = static_cast<int32_t>(modulus);
+        int32_t r = ai % m;
+        if (r < 0) r += m;
+        return static_cast<int64_t>(r);
+    }
+#endif
+    return euclid_mod_i64(a, modulus);
+}
+
+/**
+ * @brief Convert a subpixel coordinate to the integer grid index below @p x (floor).
+ * @param x Source coordinate in pixels.
+ * @return Largest int64 not greater than @p x (i.e. floor), suitable as the left/top neighbor index for
+ *         bilinear/cubic sampling.
+ * @note On device, uses a floor intrinsic compatible with HIP @c __float2ll_rd lowering; on host uses @c floorf().
+ */
+__device__ __host__ __forceinline__ int64_t interp_floor_i64(float x) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    return static_cast<int64_t>(static_cast<long long>(__builtin_elementwise_floor(x)));
+#else
+    return static_cast<int64_t>(floorf(x));
+#endif
+}
+
+/**
+ * @brief Nearest-neighbor rounding of a subpixel coordinate to an integer index.
+ * @param x Source coordinate in pixels.
+ * @return Integer closest to @p x, with half values rounded away from zero (same convention as @c llroundf()).
+ */
+__device__ __host__ __forceinline__ int64_t interp_nearest_i64(float x) {
+    return static_cast<int64_t>(llroundf(x));
+}
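+
+// Worked example (sanity check): interp_floor_i64(-0.25f) == -1, giving the
+// left/top neighbor for bilinear/bicubic taps, while interp_nearest_i64(-0.25f) == 0
+// and interp_nearest_i64(0.5f) == 1 (halves round away from zero, as llroundf does).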
+
+}  // namespace detail
+}  // namespace roccv
diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp
index f55493a5..6289590b 100644
--- a/include/core/wrappers/border_wrapper.hpp
+++ b/include/core/wrappers/border_wrapper.hpp
@@ -21,13 +21,71 @@
 
 #pragma once
 
-#include <algorithm>
-
+#include "core/detail/sampling_helpers.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
 
+namespace detail {
+/**
+ * @brief Map one axis coordinate for OpenCV-style @c BORDER_REFLECT (edge pixels duplicated; not
+ *        @c BORDER_REFLECT101).
+ * @param coord Possibly out-of-bounds coordinate along the axis (width or height index space).
+ * @param extent Positive extent of the axis (number of samples, e.g. image width or height).
+ * @return In-bounds index in [0, extent) after reflection.
+ * @note Period is 2 * extent. The implementation takes the Euclidean modulo and then folds with
+ *       min(val, 2 * extent - 1 - val), a branchless equivalent of the ternary
+ *       (val < extent) ? val : scale - 1 - val. On device, a 32-bit remainder path is used when @p extent and
+ *       @p coord are in a safe range.
+ */
+__device__ __host__ inline int64_t reflect_border_coord_i64(int64_t coord, int64_t extent) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    constexpr int64_t kLim = int64_t{1} << 30;
+    if (extent > 0 && extent < kLim && coord > -kLim && coord < kLim) {
+        const int32_t e = static_cast<int32_t>(extent);
+        const int32_t scale = e * 2;
+        int32_t val = static_cast<int32_t>(coord) % scale;
+        if (val < 0) val += scale;
+        const int32_t inv = scale - 1 - val;
+        return static_cast<int64_t>(min_i32(val, inv));
+    }
+#endif
+    const int64_t scale = extent * 2;
+    const int64_t val = euclid_mod_i64(coord, scale);
+    const int64_t inv = scale - 1 - val;
+    return min_i64(val, inv);
+}
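+
+// Worked example (illustrative): for extent == 5 the REFLECT sequence near the
+// left edge is ... 2 1 0 | 0 1 2 3 4, so coord == -2 maps to
+// min(euclid_mod(-2, 10), 10 - 1 - euclid_mod(-2, 10)) == min(8, 1) == 1.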
+
+/**
+ * @brief Map one axis coordinate for OpenCV-style @c BORDER_REFLECT101 (endpoints are not repeated in the
+ *        reflection).
+ * @param coord Possibly out-of-bounds coordinate along the axis.
+ * @param extent Positive extent of the axis (number of samples). If @p extent is at most 1, returns @c 0.
+ * @return In-bounds index in [0, extent) after reflection.
+ * @note Period is 2 * extent - 2 when @p extent is greater than 1. Takes the Euclidean modulo and then folds
+ *       with (extent - 1) - abs((extent - 1) - v). On device, a 32-bit path applies when values fit a fixed
+ *       bound.
+ */
+__device__ __host__ inline int64_t reflect101_border_coord_i64(int64_t coord, int64_t extent) {
+    if (extent <= 1) {
+        return 0;
+    }
+    const int64_t scale = 2 * extent - 2;
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    constexpr int64_t kLim = int64_t{1} << 30;
+    if (extent < kLim && coord > -kLim && coord < kLim && scale > 0 && scale < kLim) {
+        const int32_t e = static_cast<int32_t>(extent);
+        const int32_t s = e * 2 - 2;
+        int32_t v = euclid_mod_i32(static_cast<int32_t>(coord), s);
+        const int32_t inner = (e - 1) - v;
+        const int32_t a = abs_i32(inner);
+        return static_cast<int64_t>((e - 1) - a);
+    }
+#endif
+    const int64_t v = euclid_mod_i64_fast(coord, scale);
+    const int64_t inner = (extent - 1) - v;
+    return (extent - 1) - abs_i64(inner);
+}
+}  // namespace detail
+
 /**
  * @brief Wrapper class for ImageWrapper. This extends the descriptors by defining behaviors for when tensor
  * coordinates go out of scope.
@@ -56,6 +114,13 @@ class BorderWrapper {
      */
     BorderWrapper(ImageWrapper<T> image_wrapper, T border_value) : m_desc(image_wrapper), m_border_value(border_value) {}
 
+    /**
+     * @brief Samples the underlying image with no border logic. The caller must ensure coordinates are in range.
+     */
+    __device__ __host__ inline const T at_inbounds(int64_t n, int64_t h, int64_t w, int64_t c) const {
+        return m_desc.at(n, h, w, c);
+    }
+
     /**
      * @brief Returns a reference to the underlying data given image coordinates. If the coordinates fall out of bounds,
      * a fallback reference based on the provided border type will be given instead.
@@ -91,48 +156,29 @@ class BorderWrapper {
         // Reflect border type implementation. (Note: This is NOT REFLECT101, pixels at the border will be duplicated as
         // is the intended behavior for this border mode.)
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) {
-            int64_t scale = imgWidth * 2;
-            int64_t val = (w % scale + scale) % scale;
-            x = (val < imgWidth) ? val : scale - 1 - val;
-
-            scale = imgHeight * 2;
-            val = (h % scale + scale) % scale;
-            y = (val < imgHeight) ? val : scale - 1 - val;
+            if (w < 0 || w >= imgWidth) {
+                x = detail::reflect_border_coord_i64(w, imgWidth);
+            }
+            if (h < 0 || h >= imgHeight) {
+                y = detail::reflect_border_coord_i64(h, imgHeight);
+            }
         }
 
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT101) {
-            if (imgWidth == 1) {
-                x = 0;
-            } else {
-                int64_t scale = 2 * imgWidth - 2;
-                x = (w % scale + scale) % scale;
-                x = imgWidth - 1 - std::abs(imgWidth - 1 - x);
-            }
-
-            if (imgHeight == 1) {
-                y = 0;
-            } else {
-                int64_t scale = 2 * imgHeight - 2;
-                y = (h % scale + scale) % scale;
-                y = imgHeight - 1 - std::abs(imgHeight - 1 - y);
-            }
+            x = detail::reflect101_border_coord_i64(w, imgWidth);
+            y = detail::reflect101_border_coord_i64(h, imgHeight);
         }
 
-        // Replicate border type implementation
+        // Replicate: clamp each axis to the nearest edge; min/max lowers to plain GPU integer ops.
        if constexpr (BorderType == eBorderType::BORDER_TYPE_REPLICATE) {
-            x = std::clamp<int64_t>(w, 0, imgWidth - 1);
-            y = std::clamp<int64_t>(h, 0, imgHeight - 1);
+            x = detail::clamp_i64(w, 0, imgWidth - 1);
+            y = detail::clamp_i64(h, 0, imgHeight - 1);
         }
 
         // Wrap border type implementation
         if constexpr (BorderType == eBorderType::BORDER_TYPE_WRAP) {
-            if (w < 0 || w >= imgWidth) {
-                x = (w % imgWidth + imgWidth) % imgWidth;
-            }
-
-            if (h < 0 || h >= imgHeight) {
-                y = (h % imgHeight + imgHeight) % imgHeight;
-            }
+            x = detail::euclid_mod_i64_fast(w, imgWidth);
+            y = detail::euclid_mod_i64_fast(h, imgHeight);
         }
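 
+        // Worked example (illustrative): for an axis extent of 5, a coordinate of -2
+        // resolves to 1 under REFLECT, 2 under REFLECT101, 0 under REPLICATE, and 3
+        // under WRAP.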
         return m_desc.at(n, y, x, c);
@@ -160,7 +206,7 @@ class BorderWrapper {
     __device__ __host__ inline int64_t batches() const { return m_desc.batches(); }
 
     /**
-     * @brief Retries the number of channels in the image.
+     * @brief Retrieves the number of channels in the image.
      *
      * @return Image channels.
      */
diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp
index 7adb8cb6..48fed6c3 100644
--- a/include/core/wrappers/interpolation_wrapper.hpp
+++ b/include/core/wrappers/interpolation_wrapper.hpp
@@ -22,13 +22,12 @@
 #pragma once
 
 #include "core/detail/casting.hpp"
-#include "core/detail/math/vectorized_type_math.hpp"
-#include "core/wrappers/border_wrapper.hpp"
+#include "core/detail/sampling_helpers.hpp"
 #include "core/detail/vector_utils.hpp"
+#include "core/wrappers/border_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
-
 /**
  * @brief A kernel-friendly wrapper which provides interpolation logic based on the given
  * coordinates. This tensor wrapper is typically only used for input tensors and does not provide write access to its
@@ -66,22 +65,12 @@ class InterpolationWrapper {
      * @return None.
      */
     __device__ __host__ inline void CalBicubicWeights(float dist, float* weight) const {
-        weight[0] = -0.5f;
-        weight[0] = weight[0] * dist + 1.0f;
-        weight[0] = weight[0] * dist - 0.5f;
-        weight[0] = weight[0] * dist;
-
-        weight[1] = 1.5f;
-        weight[1] = weight[1] * dist - 2.5f;
-        weight[1] = weight[1] * dist;
-        weight[1] = weight[1] * dist + 1.0f;
-
-        weight[2] = -1.5f;
-        weight[2] = weight[2] * dist + 2.f;
-        weight[2] = weight[2] * dist + 0.5f;
-        weight[2] = weight[2] * dist;
-
-        weight[3] = 1 - weight[0] - weight[1] - weight[2];
+        const float d = dist;
+        // Horner form with fused multiply-add: one rounding per step instead of a separate mul and add.
+        weight[0] = fmaf(fmaf(fmaf(-0.5f, d, 1.0f), d, -0.5f), d, 0.f);
+        weight[1] = fmaf(fmaf(fmaf(1.5f, d, -2.5f), d, 0.f), d, 1.0f);
+        weight[2] = fmaf(fmaf(fmaf(-1.5f, d, 2.f), d, 0.5f), d, 0.f);
+        weight[3] = 1.f - weight[0] - weight[1] - weight[2];
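+        // Sanity check (illustrative): these are the Catmull-Rom weights (a = -0.5).
+        // They sum to 1 by construction, and at dist == 0 they reduce to {0, 1, 0, 0},
+        // so a sample landing exactly on the integer grid is returned unchanged.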
     }
 
     /**
@@ -94,8 +83,7 @@ class InterpolationWrapper {
      */
     inline __device__ __host__ const T at(int64_t n, float h, float w, int64_t c) const {
         if constexpr (I == eInterpolationType::INTERP_TYPE_NEAREST) {
-            // Nearest neighbor interpolation implementation
-            return m_desc.at(n, lroundf(h), lroundf(w), c);
+            return m_desc.at(n, detail::interp_nearest_i64(h), detail::interp_nearest_i64(w), c);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_LINEAR) {
             // Bilinear interpolation implementation
             // v1 -- v2
@@ -104,39 +92,79 @@ class InterpolationWrapper {
             using WorkType = detail::MakeType<float, detail::NumElements<T>>;
 
-            int64_t x0 = static_cast<int64_t>(floorf(w));
-            int64_t x1 = x0 + 1;
-            int64_t y0 = static_cast<int64_t>(floorf(h));
-            int64_t y1 = y0 + 1;
+            const int64_t x0 = detail::interp_floor_i64(w);
+            const int64_t y0 = detail::interp_floor_i64(h);
+            const int64_t x1 = x0 + 1;
+            const int64_t y1 = y0 + 1;
+            const float fx = w - static_cast<float>(x0);
+            const float fy = h - static_cast<float>(y0);
+            const float omfx = 1.f - fx;
+            const float omfy = 1.f - fy;
+
+            if (x0 >= 0 && y0 >= 0 && x1 < m_desc.width() && y1 < m_desc.height()) {
+                auto v1 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y0, x0, c));
+                auto v2 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y0, x1, c));
+                auto v3 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y1, x0, c));
+                auto v4 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y1, x1, c));
+                auto q1 = v1 * omfx + v2 * fx;
+                auto q2 = v3 * omfx + v4 * fx;
+                auto q = q1 * omfy + q2 * fy;
+                return detail::RangeCast<T>(q);
+            }
 
             auto v1 = detail::RangeCast<WorkType>(m_desc.at(n, y0, x0, c));
             auto v2 = detail::RangeCast<WorkType>(m_desc.at(n, y0, x1, c));
             auto v3 = detail::RangeCast<WorkType>(m_desc.at(n, y1, x0, c));
             auto v4 = detail::RangeCast<WorkType>(m_desc.at(n, y1, x1, c));
-            auto q1 = v1 * (x1 - w) + v2 * (w - x0);
-            auto q2 = v3 * (x1 - w) + v4 * (w - x0);
-            auto q = q1 * (y1 - h) + q2 * (h - y0);
+            auto q1 = v1 * omfx + v2 * fx;
+            auto q2 = v3 * omfx + v4 * fx;
+            auto q = q1 * omfy + q2 * fy;
 
             return detail::RangeCast<T>(q);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_CUBIC) {
             using namespace roccv::detail;
             using WorkType = detail::MakeType<float, detail::NumElements<T>>;
 
-            // Integer coordinates for pixel (x, y)
-            int64_t int_x = static_cast<int64_t>(floorf(w));
-            int64_t int_y = static_cast<int64_t>(floorf(h));
+            const int64_t int_x = detail::interp_floor_i64(w);
+            const int64_t int_y = detail::interp_floor_i64(h);
 
-            // Calculate weights
             float weight_x[4], weight_y[4];
-            CalBicubicWeights(w - int_x, weight_x);
-            CalBicubicWeights(h - int_y, weight_y);
+            CalBicubicWeights(w - static_cast<float>(int_x), weight_x);
+            CalBicubicWeights(h - static_cast<float>(int_y), weight_y);
+
+            float wxy[16];
+            int k = 0;
+#pragma unroll
+            for (int j = 0; j < 4; j++) {
+#pragma unroll
+                for (int i = 0; i < 4; i++) {
+                    wxy[k++] = weight_y[j] * weight_x[i];
+                }
+            }
 
-            // Weighted sum
             WorkType sum = SetAll<WorkType>(0.0f);
-            for (int index_y = -1; index_y <= 2; index_y++) {
-                for (int index_x = -1; index_x <= 2; index_x++) {
-                    sum += detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, 0)) * (weight_x[index_x + 1] * weight_y[index_y + 1]);
+            const bool cubic_fast =
+                int_x >= 1 && int_y >= 1 && (int_x + 2) < m_desc.width() && (int_y + 2) < m_desc.height();
+            k = 0;
+            if (cubic_fast) {
+#pragma unroll
+                for (int index_y = -1; index_y <= 2; index_y++) {
+#pragma unroll
+                    for (int index_x = -1; index_x <= 2; index_x++) {
+                        sum = sum +
+                              detail::RangeCast<WorkType>(m_desc.at_inbounds(n, int_y + index_y, int_x + index_x, c)) *
+                                  wxy[k++];
+                    }
+                }
+            } else {
+#pragma unroll
+                for (int index_y = -1; index_y <= 2; index_y++) {
+#pragma unroll
+                    for (int index_x = -1; index_x <= 2; index_x++) {
+                        sum = sum +
+                              detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, c)) * wxy[k++];
+                    }
                 }
             }
diff --git a/include/kernels/device/warp_affine_device.hpp b/include/kernels/device/warp_affine_device.hpp
new file mode 100644
index 00000000..bf417d52
--- /dev/null
+++ b/include/kernels/device/warp_affine_device.hpp
@@ -0,0 +1,45 @@
+/**
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include "core/wrappers/interpolation_wrapper.hpp"
+
+namespace Kernels {
+namespace Device {
+
+template <typename SrcWrapper, typename DstWrapper, typename Mat>
+__global__ void warp_affine(SrcWrapper input, DstWrapper output, Mat mat) {
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const int b = blockIdx.z;
+
+    if (x >= output.width() || y >= output.height()) return;
+
+    const float ox = mat[0] * static_cast<float>(x) + mat[1] * static_cast<float>(y) + mat[2];
+    const float oy = mat[3] * static_cast<float>(x) + mat[4] * static_cast<float>(y) + mat[5];
+    output.at(b, y, x, 0) = input.at(b, oy, ox, 0);
+}
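+
+// Mapping convention (illustrative): `mat` holds the output-to-input affine, so
+// each output pixel (x, y) reads the input at (ox, oy). For a pure translation
+// mat = {1, 0, tx, 0, 1, ty}, the kernel samples input(x + tx, y + ty), i.e. the
+// image content shifts by (-tx, -ty) in the output.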
+}  // namespace Device
+}  // namespace Kernels
diff --git a/include/kernels/host/warp_affine_host.hpp b/include/kernels/host/warp_affine_host.hpp
new file mode 100644
index 00000000..085c2179
--- /dev/null
+++ b/include/kernels/host/warp_affine_host.hpp
@@ -0,0 +1,42 @@
+/**
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <cstdint>
+
+namespace Kernels {
+namespace Host {
+template <typename SrcWrapper, typename DstWrapper, typename Mat>
+void warp_affine(SrcWrapper input, DstWrapper output, Mat mat) {
+    for (int b = 0; b < output.batches(); b++) {
+        for (int y = 0; y < output.height(); y++) {
+            for (int x = 0; x < output.width(); x++) {
+                const float ox = mat[0] * static_cast<float>(x) + mat[1] * static_cast<float>(y) + mat[2];
+                const float oy = mat[3] * static_cast<float>(x) + mat[4] * static_cast<float>(y) + mat[5];
+                output.at(b, y, x, 0) = input.at(b, oy, ox, 0);
+            }
+        }
+    }
+}
+}  // namespace Host
+}  // namespace Kernels
diff --git a/include/op_warp_affine.hpp b/include/op_warp_affine.hpp
index 26960501..4ca7c751 100644
--- a/include/op_warp_affine.hpp
+++ b/include/op_warp_affine.hpp
@@ -21,9 +21,10 @@ THE SOFTWARE.
 */
 #pragma once
 
+#include <hip/hip_runtime.h>
+
 #include "core/tensor.hpp"
 #include "i_operator.hpp"
-#include "op_warp_perspective.hpp"
 #include "operator_types.h"
 
 namespace roccv {
@@ -40,7 +41,7 @@ class WarpAffine final : public IOperator {
      * @brief Construct a new WarpAffine object
      *
      */
-    WarpAffine() : m_op() {}
+    WarpAffine() = default;
 
     /**
      * @brief Destroy the WarpAffine object
@@ -89,9 +90,5 @@ class WarpAffine final : public IOperator {
     void operator()(hipStream_t stream, const Tensor& input, const Tensor& output, const AffineTransform xform,
                     bool isInverted, eInterpolationType interp, eBorderType borderMode, float4 borderValue,
                     eDeviceType device = eDeviceType::GPU) const;
-
-   private:
-    // WarpPerspective op is used to execute the affine transformation, as affine transformation is a subset.
-    WarpPerspective m_op;
 };
 }  // namespace roccv
\ No newline at end of file
diff --git a/src/op_warp_affine.cpp b/src/op_warp_affine.cpp
index a7346507..ee29111f 100644
--- a/src/op_warp_affine.cpp
+++ b/src/op_warp_affine.cpp
@@ -21,25 +21,142 @@ THE SOFTWARE.
 */
 
 #include "op_warp_affine.hpp"
-#include <hip/hip_runtime.h>
+
+#include <unordered_map>
+
+#include "common/array_wrapper.hpp"
+#include "common/validation_helpers.hpp"
+#include "core/detail/casting.hpp"
+#include "core/detail/math/math.hpp"
+#include "kernels/device/warp_affine_device.hpp"
+#include "kernels/host/warp_affine_host.hpp"
 
 namespace roccv {
+
+template <typename T, eBorderType B, eInterpolationType I>
+void dispatch_warp_affine_interp(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                 const AffineTransform affineInv, T borderValue, eDeviceType device) {
+    ArrayWrapper<float, 6> transform(affineInv);
+    ImageWrapper<T> outputWrapper(output);
+    InterpolationWrapper<T, B, I> inputWrapper(input, borderValue);
+
+    switch (device) {
+        case eDeviceType::GPU: {
+            dim3 block(64, 16);
+            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
+                      outputWrapper.batches());
+            Kernels::Device::warp_affine<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);
+            break;
+        }
+
+        case eDeviceType::CPU: {
+            Kernels::Host::warp_affine(inputWrapper, outputWrapper, transform);
+            break;
+        }
+    }
+}
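+
+// Worked example (illustrative): for a 1920x1080 output with block = (64, 16),
+// the grid is ceil(1920/64) x ceil(1080/16) x batches = 30 x 68 x N blocks; the
+// in-kernel bounds check discards threads in the partial-block overhang.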
+
+template <typename T, eBorderType B>
+void dispatch_warp_affine_border_mode(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                      const AffineTransform affineInv, eInterpolationType interpolation, T borderValue,
+                                      eDeviceType device) {
+    // clang-format off
+    static const std::unordered_map<eInterpolationType, std::function<void(hipStream_t, const Tensor&, const Tensor&, const AffineTransform, T, eDeviceType)>>
+        funcs = {
+            {eInterpolationType::INTERP_TYPE_NEAREST, dispatch_warp_affine_interp<T, B, eInterpolationType::INTERP_TYPE_NEAREST>},
+            {eInterpolationType::INTERP_TYPE_LINEAR,  dispatch_warp_affine_interp<T, B, eInterpolationType::INTERP_TYPE_LINEAR>},
+            {eInterpolationType::INTERP_TYPE_CUBIC,   dispatch_warp_affine_interp<T, B, eInterpolationType::INTERP_TYPE_CUBIC>}
+        };
+    // clang-format on
+
+    if (!funcs.contains(interpolation)) {
+        throw Exception("Operation does not support the given interpolation mode.", eStatusType::NOT_IMPLEMENTED);
+    }
+
+    auto func = funcs.at(interpolation);
+    func(stream, input, output, affineInv, borderValue, device);
+}
+
+template <typename T>
+void dispatch_warp_affine_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                const AffineTransform affineInv, eInterpolationType interpolation,
+                                eBorderType borderType, float4 borderValue, eDeviceType device) {
+    // clang-format off
+    static const std::unordered_map<eBorderType, std::function<void(hipStream_t, const Tensor&, const Tensor&, const AffineTransform, eInterpolationType, T, eDeviceType)>>
+        funcs = {
+            {eBorderType::BORDER_TYPE_CONSTANT,   dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_CONSTANT>},
+            {eBorderType::BORDER_TYPE_REPLICATE,  dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_REPLICATE>},
+            {eBorderType::BORDER_TYPE_REFLECT,    dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_REFLECT>},
+            {eBorderType::BORDER_TYPE_REFLECT101, dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_REFLECT101>},
+            {eBorderType::BORDER_TYPE_WRAP,       dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_WRAP>}
+        };
+    // clang-format on
+
+    if (!funcs.contains(borderType)) {
+        throw Exception("Operator does not support the given border mode.", eStatusType::NOT_IMPLEMENTED);
+    }
 
-void WarpAffine::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, const AffineTransform xform,
+    auto func = funcs.at(borderType);
+    func(stream, input, output, affineInv, interpolation, detail::SaturateCast<T>(borderValue), device);
+}
+
+void WarpAffine::operator()(hipStream_t stream, const Tensor &input, const Tensor &output, const AffineTransform xform,
                             bool isInverted, eInterpolationType interp, eBorderType borderMode, float4 borderValue,
                             eDeviceType device) const {
-    // An affine transformation is a subset of a perspective transform, so use the existing operator. All tensor
-    // validation is performed in the WarpPerspective operator as well.
+    CHECK_TENSOR_DEVICE(input, device);
+    CHECK_TENSOR_DATATYPES(input, DATA_TYPE_S8, DATA_TYPE_U8, DATA_TYPE_U16, DATA_TYPE_S16, DATA_TYPE_U32,
+                           DATA_TYPE_S32, DATA_TYPE_F32, DATA_TYPE_F64);
+    CHECK_TENSOR_LAYOUT(input, TENSOR_LAYOUT_HWC, TENSOR_LAYOUT_NHWC);
+    CHECK_TENSOR_CHANNELS(input, 1, 3, 4);
+
+    eDataType dtype = input.dtype().etype();
+    int64_t channels = input.shape(input.layout().channels_index());
+
+    CHECK_TENSOR_COMPARISON(input.device() == output.device());
+    CHECK_TENSOR_COMPARISON(output.shape(output.layout().channels_index()) == channels);
+    CHECK_TENSOR_COMPARISON(output.dtype() == input.dtype());
+    CHECK_TENSOR_COMPARISON(output.layout() == input.layout());
+    if (output.layout().batch_index() != -1) {
+        CHECK_TENSOR_COMPARISON(output.shape(output.layout().batch_index()) ==
+                                input.shape(input.layout().batch_index()));
+    }
+
+    PerspectiveTransform full{};
+#pragma unroll
+    for (int i = 0; i < 6; i++) {
+        full[i] = xform[i];
+    }
+    full[6] = 0.0f;
+    full[7] = 0.0f;
+    full[8] = 1.0f;
 
-    PerspectiveTransform transform;
+    detail::math::Matrix<float, 3, 3> mat;
+    mat.load(full);
+    if (!isInverted) {
+        detail::math::inv_inplace(mat);
+    }
+    mat.store(full);
+
+    AffineTransform affineInv{};
 #pragma unroll
     for (int i = 0; i < 6; i++) {
-        transform[i] = xform[i];
+        affineInv[i] = full[i];
     }
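+
+    // Worked example (illustrative): for xform = {2, 0, 0, 0, 2, 0} (uniform scale
+    // by 2) with isInverted == false, the inverted mapping is {0.5, 0, 0, 0, 0.5, 0},
+    // so every output pixel samples the input at half its own coordinates; inverse
+    // mapping guarantees each output pixel receives exactly one sample.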
-    transform[6] = 0.0f;
-    transform[7] = 0.0f;
-    transform[8] = 1.0f;
 
-    m_op(stream, input, output, transform, isInverted, interp, borderMode, borderValue, device);
+    // clang-format off
+    static const std::unordered_map<eDataType, std::array<std::function<void(hipStream_t, const Tensor&, const Tensor&, const AffineTransform, eInterpolationType, eBorderType, float4, eDeviceType)>, 4>>
+        funcs = {
+            {eDataType::DATA_TYPE_U8,  {dispatch_warp_affine_dtype<uchar1>,  0, dispatch_warp_affine_dtype<uchar3>,  dispatch_warp_affine_dtype<uchar4>}},
+            {eDataType::DATA_TYPE_S8,  {dispatch_warp_affine_dtype<char1>,   0, dispatch_warp_affine_dtype<char3>,   dispatch_warp_affine_dtype<char4>}},
+            {eDataType::DATA_TYPE_U16, {dispatch_warp_affine_dtype<ushort1>, 0, dispatch_warp_affine_dtype<ushort3>, dispatch_warp_affine_dtype<ushort4>}},
+            {eDataType::DATA_TYPE_S16, {dispatch_warp_affine_dtype<short1>,  0, dispatch_warp_affine_dtype<short3>,  dispatch_warp_affine_dtype<short4>}},
+            {eDataType::DATA_TYPE_U32, {dispatch_warp_affine_dtype<uint1>,   0, dispatch_warp_affine_dtype<uint3>,   dispatch_warp_affine_dtype<uint4>}},
+            {eDataType::DATA_TYPE_S32, {dispatch_warp_affine_dtype<int1>,    0, dispatch_warp_affine_dtype<int3>,    dispatch_warp_affine_dtype<int4>}},
+            {eDataType::DATA_TYPE_F32, {dispatch_warp_affine_dtype<float1>,  0, dispatch_warp_affine_dtype<float3>,  dispatch_warp_affine_dtype<float4>}},
+            {eDataType::DATA_TYPE_F64, {dispatch_warp_affine_dtype<double1>, 0, dispatch_warp_affine_dtype<double3>, dispatch_warp_affine_dtype<double4>}}
+        };
+    // clang-format on
+
+    auto func = funcs.at(dtype)[channels - 1];
+    if (func == 0) throw Exception("Not mapped to a defined function.", eStatusType::INVALID_OPERATION);
+
+    func(stream, input, output, affineInv, interp, borderMode, borderValue, device);
 }
-}  // namespace roccv
\ No newline at end of file
+}  // namespace roccv