diff --git a/include/core/detail/sampling_helpers.hpp b/include/core/detail/sampling_helpers.hpp
new file mode 100644
index 00000000..812868f8
--- /dev/null
+++ b/include/core/detail/sampling_helpers.hpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#pragma once
+
+/**
+ * @file sampling_helpers.hpp
+ * @brief Small host/device helpers for border coordinate math and interpolation index conversion.
+ */
+
+#include <hip/hip_runtime.h>
+
+#include <cmath>
+#include <cstdint>
+
+namespace roccv {
+namespace detail {
+
+/**
+ * @brief Branchless absolute value of a signed 64-bit integer (two's complement).
+ * @param v Input value.
+ * @return Non-negative absolute value of @p v.
+ * @note Avoids @c std::abs in device code; useful on GPU hot paths.
+ */
+__device__ __host__ __forceinline__ int64_t abs_i64(int64_t v) {
+    const int64_t mask = v >> 63;
+    return (v ^ mask) - mask;
+}
+
+/**
+ * @brief Branchless absolute value of a signed 32-bit integer (two's complement).
+ * @param v Input value.
+ * @return Non-negative absolute value of @p v.
+ */
+__device__ __host__ __forceinline__ int32_t abs_i32(int32_t v) {
+    const int32_t mask = v >> 31;
+    return (v ^ mask) - mask;
+}
+
+/**
+ * @brief Minimum of two 32-bit signed integers.
+ * @param a First operand.
+ * @param b Second operand.
+ * @return The lesser of @p a and @p b.
+ */
+__device__ __host__ __forceinline__ int32_t min_i32(int32_t a, int32_t b) { return a < b ? a : b; }
+
+/**
+ * @brief Minimum of two 64-bit signed integers.
+ * @param a First operand.
+ * @param b Second operand.
+ * @return The lesser of @p a and @p b.
+ */
+__device__ __host__ __forceinline__ int64_t min_i64(int64_t a, int64_t b) { return a < b ? a : b; }
+
+/**
+ * @brief Maximum of two 64-bit signed integers.
+ * @param a First operand.
+ * @param b Second operand.
+ * @return The greater of @p a and @p b.
+ */
+__device__ __host__ __forceinline__ int64_t max_i64(int64_t a, int64_t b) { return a > b ? a : b; }
+
+/**
+ * @brief Clamp a signed 64-bit integer to a closed interval.
+ * @param v Value to clamp.
+ * @param lo Lower bound (inclusive).
+ * @param hi Upper bound (inclusive); must satisfy @p lo <= @p hi.
+ * @return @p v restricted to the inclusive interval between @p lo and @p hi.
+ */
+__device__ __host__ __forceinline__ int64_t clamp_i64(int64_t v, int64_t lo, int64_t hi) {
+    return min_i64(max_i64(v, lo), hi);
+}
+
+/**
+ * @brief Euclidean (non-negative) modulo for 32-bit operands.
+ * @param a Dividend.
+ * @param modulus Strictly positive modulus.
+ * @return Remainder in the half-open range [0, modulus), congruent to @p a modulo @p modulus.
+ */
+__device__ __host__ inline int32_t euclid_mod_i32(int32_t a, int32_t modulus) {
+    int32_t r = a % modulus;
+    if (r < 0) r += modulus;
+    return r;
+}
+
+/**
+ * @brief Euclidean (non-negative) modulo for 64-bit operands.
+ * @param a Dividend.
+ * @param modulus Strictly positive modulus.
+ * @return Remainder in the half-open range [0, modulus), congruent to @p a modulo @p modulus.
+ * @note Uses a single remainder plus a conditional correction instead of the double-modulo idiom
+ *       ((a % m + m) % m).
+ */
+__device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) {
+    int64_t r = a % modulus;
+    if (r < 0) r += modulus;
+    return r;
+}
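+
+// Worked example (sanity check, not part of the API): C++'s % truncates toward
+// zero, so -3 % 5 == -3, whereas euclid_mod_i32(-3, 5) == 2 and
+// euclid_mod_i32(7, 5) == 2. Both results land in [0, 5), which is what the
+// border-wrapping code below relies on.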
+
+/**
+ * @brief Euclidean modulo with a 32-bit fast path on the GPU when operands are in a safe range.
+ * @param a Dividend.
+ * @param modulus Strictly positive modulus.
+ * @return Same as euclid_mod_i64() when operands fit the device fast path; otherwise defers to euclid_mod_i64().
+ * @note On device, uses a 32-bit remainder when @p modulus and @p a are sufficiently small to avoid 64-bit
+ *       division. On host, always uses euclid_mod_i64(). The range check is performed internally; out-of-range
+ *       operands simply fall back to the 64-bit path.
+ */
+__device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulus) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    constexpr int64_t kLim = int64_t{1} << 30;
+    if (modulus > 0 && modulus < kLim && a > -kLim && a < kLim) {
+        int32_t ai = static_cast<int32_t>(a);
+        int32_t m = static_cast<int32_t>(modulus);
+        int32_t r = ai % m;
+        if (r < 0) r += m;
+        return static_cast<int64_t>(r);
+    }
+#endif
+    return euclid_mod_i64(a, modulus);
+}
+
+/**
+ * @brief Convert a subpixel coordinate to the integer grid index below @p x (floor).
+ * @param x Source coordinate in pixels.
+ * @return Largest int64 not greater than @p x (i.e. floor), suitable as the left/top neighbor index for
+ *         bilinear/cubic sampling.
+ * @note On device, uses a floor intrinsic compatible with HIP @c __float2ll_rd lowering; on host uses @c floorf().
+ */
+__device__ __host__ __forceinline__ int64_t interp_floor_i64(float x) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    return static_cast<int64_t>(static_cast<long long>(__builtin_elementwise_floor(x)));
+#else
+    return static_cast<int64_t>(floorf(x));
+#endif
+}
+
+/**
+ * @brief Nearest-neighbor rounding of a subpixel coordinate to an integer index.
+ * @param x Source coordinate in pixels.
+ * @return Integer closest to @p x, with half values rounded away from zero (same convention as @c llroundf()).
+ */
+__device__ __host__ __forceinline__ int64_t interp_nearest_i64(float x) {
+    return static_cast<int64_t>(llroundf(x));
+}
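+
+// Worked example (sanity check): interp_floor_i64(-0.25f) == -1, giving the
+// left/top neighbor for bilinear/bicubic taps, while interp_nearest_i64(-0.25f) == 0
+// and interp_nearest_i64(0.5f) == 1 (halves round away from zero, as llroundf does).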
+
+}  // namespace detail
+}  // namespace roccv
diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp
index f55493a5..6289590b 100644
--- a/include/core/wrappers/border_wrapper.hpp
+++ b/include/core/wrappers/border_wrapper.hpp
@@ -21,13 +21,71 @@
 
 #pragma once
 
-#include <algorithm>
-
+#include "core/detail/sampling_helpers.hpp"
 #include "core/wrappers/image_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
 
+namespace detail {
+/**
+ * @brief Map one axis coordinate for OpenCV-style @c BORDER_REFLECT (edge pixels duplicated; not
+ *        @c BORDER_REFLECT101).
+ * @param coord Possibly out-of-bounds coordinate along the axis (width or height index space).
+ * @param extent Positive extent of the axis (number of samples, e.g. image width or height).
+ * @return In-bounds index in [0, extent) after reflection.
+ * @note Period is 2 * extent. The implementation takes the Euclidean modulo and then folds with
+ *       min(val, 2 * extent - 1 - val), a branchless equivalent of the ternary
+ *       (val < extent) ? val : scale - 1 - val. On device, a 32-bit remainder path is used when @p extent and
+ *       @p coord are in a safe range.
+ */
+__device__ __host__ inline int64_t reflect_border_coord_i64(int64_t coord, int64_t extent) {
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    constexpr int64_t kLim = int64_t{1} << 30;
+    if (extent > 0 && extent < kLim && coord > -kLim && coord < kLim) {
+        const int32_t e = static_cast<int32_t>(extent);
+        const int32_t scale = e * 2;
+        int32_t val = static_cast<int32_t>(coord) % scale;
+        if (val < 0) val += scale;
+        const int32_t inv = scale - 1 - val;
+        return static_cast<int64_t>(min_i32(val, inv));
+    }
+#endif
+    const int64_t scale = extent * 2;
+    const int64_t val = euclid_mod_i64(coord, scale);
+    const int64_t inv = scale - 1 - val;
+    return min_i64(val, inv);
+}
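+
+// Worked example (illustrative): for extent == 5 the REFLECT sequence near the
+// left edge is ... 2 1 0 | 0 1 2 3 4, so coord == -2 maps to
+// min(euclid_mod(-2, 10), 10 - 1 - euclid_mod(-2, 10)) == min(8, 1) == 1.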
+
+/**
+ * @brief Map one axis coordinate for OpenCV-style @c BORDER_REFLECT101 (endpoints are not repeated in the
+ *        reflection).
+ * @param coord Possibly out-of-bounds coordinate along the axis.
+ * @param extent Positive extent of the axis (number of samples). If @p extent is at most 1, returns @c 0.
+ * @return In-bounds index in [0, extent) after reflection.
+ * @note Period is 2 * extent - 2 when @p extent is greater than 1. Takes the Euclidean modulo and then folds
+ *       with (extent - 1) - abs((extent - 1) - v). On device, a 32-bit path applies when values fit a fixed
+ *       bound.
+ */
+__device__ __host__ inline int64_t reflect101_border_coord_i64(int64_t coord, int64_t extent) {
+    if (extent <= 1) {
+        return 0;
+    }
+    const int64_t scale = 2 * extent - 2;
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
+    constexpr int64_t kLim = int64_t{1} << 30;
+    if (extent < kLim && coord > -kLim && coord < kLim && scale > 0 && scale < kLim) {
+        const int32_t e = static_cast<int32_t>(extent);
+        const int32_t s = e * 2 - 2;
+        int32_t v = euclid_mod_i32(static_cast<int32_t>(coord), s);
+        const int32_t inner = (e - 1) - v;
+        const int32_t a = abs_i32(inner);
+        return static_cast<int64_t>((e - 1) - a);
+    }
+#endif
+    const int64_t v = euclid_mod_i64_fast(coord, scale);
+    const int64_t inner = (extent - 1) - v;
+    return (extent - 1) - abs_i64(inner);
+}
+}  // namespace detail
+
 /**
  * @brief Wrapper class for ImageWrapper. This extends the descriptors by defining behaviors for when tensor
  * coordinates go out of scope.
@@ -56,6 +114,13 @@ class BorderWrapper {
      */
     BorderWrapper(ImageWrapper<T> image_wrapper, T border_value) : m_desc(image_wrapper), m_border_value(border_value) {}
 
+    /**
+     * @brief Samples the underlying image with no border logic. The caller must ensure coordinates are in range.
+     */
+    __device__ __host__ inline const T at_inbounds(int64_t n, int64_t h, int64_t w, int64_t c) const {
+        return m_desc.at(n, h, w, c);
+    }
+
     /**
      * @brief Returns a reference to the underlying data given image coordinates. If the coordinates fall out of bounds,
      * a fallback reference based on the provided border type will be given instead.
@@ -91,48 +156,29 @@ class BorderWrapper {
         // Reflect border type implementation. (Note: This is NOT REFLECT101, pixels at the border will be duplicated as
         // is the intended behavior for this border mode.)
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) {
-            int64_t scale = imgWidth * 2;
-            int64_t val = (w % scale + scale) % scale;
-            x = (val < imgWidth) ? val : scale - 1 - val;
-
-            scale = imgHeight * 2;
-            val = (h % scale + scale) % scale;
-            y = (val < imgHeight) ? val : scale - 1 - val;
+            if (w < 0 || w >= imgWidth) {
+                x = detail::reflect_border_coord_i64(w, imgWidth);
+            }
+            if (h < 0 || h >= imgHeight) {
+                y = detail::reflect_border_coord_i64(h, imgHeight);
+            }
         }
 
         if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT101) {
-            if (imgWidth == 1) {
-                x = 0;
-            } else {
-                int64_t scale = 2 * imgWidth - 2;
-                x = (w % scale + scale) % scale;
-                x = imgWidth - 1 - std::abs(imgWidth - 1 - x);
-            }
-
-            if (imgHeight == 1) {
-                y = 0;
-            } else {
-                int64_t scale = 2 * imgHeight - 2;
-                y = (h % scale + scale) % scale;
-                y = imgHeight - 1 - std::abs(imgHeight - 1 - y);
-            }
+            x = detail::reflect101_border_coord_i64(w, imgWidth);
+            y = detail::reflect101_border_coord_i64(h, imgHeight);
         }
 
-        // Replicate border type implementation
+        // Replicate: clamp each axis to the nearest edge; min/max lowers to plain GPU integer ops.
        if constexpr (BorderType == eBorderType::BORDER_TYPE_REPLICATE) {
-            x = std::clamp<int64_t>(w, 0, imgWidth - 1);
-            y = std::clamp<int64_t>(h, 0, imgHeight - 1);
+            x = detail::clamp_i64(w, 0, imgWidth - 1);
+            y = detail::clamp_i64(h, 0, imgHeight - 1);
         }
 
         // Wrap border type implementation
         if constexpr (BorderType == eBorderType::BORDER_TYPE_WRAP) {
-            if (w < 0 || w >= imgWidth) {
-                x = (w % imgWidth + imgWidth) % imgWidth;
-            }
-
-            if (h < 0 || h >= imgHeight) {
-                y = (h % imgHeight + imgHeight) % imgHeight;
-            }
+            x = detail::euclid_mod_i64_fast(w, imgWidth);
+            y = detail::euclid_mod_i64_fast(h, imgHeight);
         }
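 
+        // Worked example (illustrative): for an axis extent of 5, a coordinate of -2
+        // resolves to 1 under REFLECT, 2 under REFLECT101, 0 under REPLICATE, and 3
+        // under WRAP.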
         return m_desc.at(n, y, x, c);
@@ -160,7 +206,7 @@ class BorderWrapper {
     __device__ __host__ inline int64_t batches() const { return m_desc.batches(); }
 
     /**
-     * @brief Retries the number of channels in the image.
+     * @brief Retrieves the number of channels in the image.
      *
      * @return Image channels.
      */
diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp
index 7adb8cb6..48fed6c3 100644
--- a/include/core/wrappers/interpolation_wrapper.hpp
+++ b/include/core/wrappers/interpolation_wrapper.hpp
@@ -22,13 +22,12 @@
 #pragma once
 
 #include "core/detail/casting.hpp"
-#include "core/detail/math/vectorized_type_math.hpp"
-#include "core/wrappers/border_wrapper.hpp"
+#include "core/detail/sampling_helpers.hpp"
 #include "core/detail/vector_utils.hpp"
+#include "core/wrappers/border_wrapper.hpp"
 #include "operator_types.h"
 
 namespace roccv {
-
 /**
  * @brief A kernel-friendly wrapper which provides interpolation logic based on the given
  * coordinates. This tensor wrapper is typically only used for input tensors and does not provide write access to its
@@ -66,22 +65,12 @@ class InterpolationWrapper {
      * @return None.
      */
     __device__ __host__ inline void CalBicubicWeights(float dist, float* weight) const {
-        weight[0] = -0.5f;
-        weight[0] = weight[0] * dist + 1.0f;
-        weight[0] = weight[0] * dist - 0.5f;
-        weight[0] = weight[0] * dist;
-
-        weight[1] = 1.5f;
-        weight[1] = weight[1] * dist - 2.5f;
-        weight[1] = weight[1] * dist;
-        weight[1] = weight[1] * dist + 1.0f;
-
-        weight[2] = -1.5f;
-        weight[2] = weight[2] * dist + 2.f;
-        weight[2] = weight[2] * dist + 0.5f;
-        weight[2] = weight[2] * dist;
-
-        weight[3] = 1 - weight[0] - weight[1] - weight[2];
+        const float d = dist;
+        // Horner form with fused multiply-add: one rounding per step instead of a separate mul and add.
+        weight[0] = fmaf(fmaf(fmaf(-0.5f, d, 1.0f), d, -0.5f), d, 0.f);
+        weight[1] = fmaf(fmaf(fmaf(1.5f, d, -2.5f), d, 0.f), d, 1.0f);
+        weight[2] = fmaf(fmaf(fmaf(-1.5f, d, 2.f), d, 0.5f), d, 0.f);
+        weight[3] = 1.f - weight[0] - weight[1] - weight[2];
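+        // Sanity check (illustrative): these are the Catmull-Rom weights (a = -0.5).
+        // They sum to 1 by construction, and at dist == 0 they reduce to {0, 1, 0, 0},
+        // so a sample landing exactly on the integer grid is returned unchanged.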
     }
 
     /**
@@ -94,8 +83,7 @@ class InterpolationWrapper {
      */
     inline __device__ __host__ const T at(int64_t n, float h, float w, int64_t c) const {
         if constexpr (I == eInterpolationType::INTERP_TYPE_NEAREST) {
-            // Nearest neighbor interpolation implementation
-            return m_desc.at(n, lroundf(h), lroundf(w), c);
+            return m_desc.at(n, detail::interp_nearest_i64(h), detail::interp_nearest_i64(w), c);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_LINEAR) {
             // Bilinear interpolation implementation
             // v1 -- v2
@@ -104,39 +92,79 @@ class InterpolationWrapper {
             using WorkType = detail::MakeType<float, detail::NumElements<T>>;
 
-            int64_t x0 = static_cast<int64_t>(floorf(w));
-            int64_t x1 = x0 + 1;
-            int64_t y0 = static_cast<int64_t>(floorf(h));
-            int64_t y1 = y0 + 1;
+            const int64_t x0 = detail::interp_floor_i64(w);
+            const int64_t y0 = detail::interp_floor_i64(h);
+            const int64_t x1 = x0 + 1;
+            const int64_t y1 = y0 + 1;
+            const float fx = w - static_cast<float>(x0);
+            const float fy = h - static_cast<float>(y0);
+            const float omfx = 1.f - fx;
+            const float omfy = 1.f - fy;
+
+            if (x0 >= 0 && y0 >= 0 && x1 < m_desc.width() && y1 < m_desc.height()) {
+                auto v1 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y0, x0, c));
+                auto v2 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y0, x1, c));
+                auto v3 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y1, x0, c));
+                auto v4 = detail::RangeCast<WorkType>(m_desc.at_inbounds(n, y1, x1, c));
+                auto q1 = v1 * omfx + v2 * fx;
+                auto q2 = v3 * omfx + v4 * fx;
+                auto q = q1 * omfy + q2 * fy;
+                return detail::RangeCast<T>(q);
+            }
 
             auto v1 = detail::RangeCast<WorkType>(m_desc.at(n, y0, x0, c));
             auto v2 = detail::RangeCast<WorkType>(m_desc.at(n, y0, x1, c));
             auto v3 = detail::RangeCast<WorkType>(m_desc.at(n, y1, x0, c));
             auto v4 = detail::RangeCast<WorkType>(m_desc.at(n, y1, x1, c));
-            auto q1 = v1 * (x1 - w) + v2 * (w - x0);
-            auto q2 = v3 * (x1 - w) + v4 * (w - x0);
-            auto q = q1 * (y1 - h) + q2 * (h - y0);
+            auto q1 = v1 * omfx + v2 * fx;
+            auto q2 = v3 * omfx + v4 * fx;
+            auto q = q1 * omfy + q2 * fy;
 
             return detail::RangeCast<T>(q);
         } else if constexpr (I == eInterpolationType::INTERP_TYPE_CUBIC) {
             using namespace roccv::detail;
             using WorkType = detail::MakeType<float, detail::NumElements<T>>;
 
-            // Integer coordinates for pixel (x, y)
-            int64_t int_x = static_cast<int64_t>(floorf(w));
-            int64_t int_y = static_cast<int64_t>(floorf(h));
+            const int64_t int_x = detail::interp_floor_i64(w);
+            const int64_t int_y = detail::interp_floor_i64(h);
 
-            // Calculate weights
             float weight_x[4], weight_y[4];
-            CalBicubicWeights(w - int_x, weight_x);
-            CalBicubicWeights(h - int_y, weight_y);
+            CalBicubicWeights(w - static_cast<float>(int_x), weight_x);
+            CalBicubicWeights(h - static_cast<float>(int_y), weight_y);
+
+            float wxy[16];
+            int k = 0;
+#pragma unroll
+            for (int j = 0; j < 4; j++) {
+#pragma unroll
+                for (int i = 0; i < 4; i++) {
+                    wxy[k++] = weight_y[j] * weight_x[i];
+                }
+            }
 
-            // Weighted sum
             WorkType sum = SetAll<WorkType>(0.0f);
-            for (int index_y = -1; index_y <= 2; index_y++) {
-                for (int index_x = -1; index_x <= 2; index_x++) {
-                    sum += detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, 0)) * (weight_x[index_x + 1] * weight_y[index_y + 1]);
+            const bool cubic_fast =
+                int_x >= 1 && int_y >= 1 && (int_x + 2) < m_desc.width() && (int_y + 2) < m_desc.height();
+            k = 0;
+            if (cubic_fast) {
+#pragma unroll
+                for (int index_y = -1; index_y <= 2; index_y++) {
+#pragma unroll
+                    for (int index_x = -1; index_x <= 2; index_x++) {
+                        sum = sum +
+                              detail::RangeCast<WorkType>(m_desc.at_inbounds(n, int_y + index_y, int_x + index_x, c)) *
+                                  wxy[k++];
+                    }
+                }
+            } else {
+#pragma unroll
+                for (int index_y = -1; index_y <= 2; index_y++) {
+#pragma unroll
+                    for (int index_x = -1; index_x <= 2; index_x++) {
+                        sum = sum +
+                              detail::RangeCast<WorkType>(m_desc.at(n, int_y + index_y, int_x + index_x, c)) * wxy[k++];
+                    }
                 }
             }
diff --git a/include/kernels/device/warp_affine_device.hpp b/include/kernels/device/warp_affine_device.hpp
new file mode 100644
index 00000000..bf417d52
--- /dev/null
+++ b/include/kernels/device/warp_affine_device.hpp
@@ -0,0 +1,45 @@
+/**
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include "core/wrappers/interpolation_wrapper.hpp"
+
+namespace Kernels {
+namespace Device {
+
+template <typename SrcWrapper, typename DstWrapper, typename Mat>
+__global__ void warp_affine(SrcWrapper input, DstWrapper output, Mat mat) {
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const int b = blockIdx.z;
+
+    if (x >= output.width() || y >= output.height()) return;
+
+    const float ox = mat[0] * static_cast<float>(x) + mat[1] * static_cast<float>(y) + mat[2];
+    const float oy = mat[3] * static_cast<float>(x) + mat[4] * static_cast<float>(y) + mat[5];
+    output.at(b, y, x, 0) = input.at(b, oy, ox, 0);
+}
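+
+// Mapping convention (illustrative): `mat` holds the output-to-input affine, so
+// each output pixel (x, y) reads the input at (ox, oy). For a pure translation
+// mat = {1, 0, tx, 0, 1, ty}, the kernel samples input(x + tx, y + ty), i.e. the
+// image content shifts by (-tx, -ty) in the output.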
+}  // namespace Device
+}  // namespace Kernels
diff --git a/include/kernels/host/warp_affine_host.hpp b/include/kernels/host/warp_affine_host.hpp
new file mode 100644
index 00000000..085c2179
--- /dev/null
+++ b/include/kernels/host/warp_affine_host.hpp
@@ -0,0 +1,42 @@
+/**
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <cstdint>
+
+namespace Kernels {
+namespace Host {
+template <typename SrcWrapper, typename DstWrapper, typename Mat>
+void warp_affine(SrcWrapper input, DstWrapper output, Mat mat) {
+    for (int b = 0; b < output.batches(); b++) {
+        for (int y = 0; y < output.height(); y++) {
+            for (int x = 0; x < output.width(); x++) {
+                const float ox = mat[0] * static_cast<float>(x) + mat[1] * static_cast<float>(y) + mat[2];
+                const float oy = mat[3] * static_cast<float>(x) + mat[4] * static_cast<float>(y) + mat[5];
+                output.at(b, y, x, 0) = input.at(b, oy, ox, 0);
+            }
+        }
+    }
+}
+}  // namespace Host
+}  // namespace Kernels
diff --git a/include/op_warp_affine.hpp b/include/op_warp_affine.hpp
index 26960501..4ca7c751 100644
--- a/include/op_warp_affine.hpp
+++ b/include/op_warp_affine.hpp
@@ -21,9 +21,10 @@ THE SOFTWARE.
 */
 #pragma once
 
+#include <hip/hip_runtime.h>
+
 #include "core/tensor.hpp"
 #include "i_operator.hpp"
-#include "op_warp_perspective.hpp"
 #include "operator_types.h"
 
 namespace roccv {
@@ -40,7 +41,7 @@ class WarpAffine final : public IOperator {
      * @brief Construct a new WarpAffine object
      *
      */
-    WarpAffine() : m_op() {}
+    WarpAffine() = default;
 
     /**
      * @brief Destroy the WarpAffine object
@@ -89,9 +90,5 @@ class WarpAffine final : public IOperator {
     void operator()(hipStream_t stream, const Tensor& input, const Tensor& output, const AffineTransform xform,
                     bool isInverted, eInterpolationType interp, eBorderType borderMode, float4 borderValue,
                     eDeviceType device = eDeviceType::GPU) const;
-
-   private:
-    // WarpPerspective op is used to execute the affine transformation, as affine transformation is a subset.
-    WarpPerspective m_op;
 };
 }  // namespace roccv
\ No newline at end of file
diff --git a/src/op_warp_affine.cpp b/src/op_warp_affine.cpp
index a7346507..ee29111f 100644
--- a/src/op_warp_affine.cpp
+++ b/src/op_warp_affine.cpp
@@ -21,25 +21,142 @@ THE SOFTWARE.
 */
 
 #include "op_warp_affine.hpp"
-#include <hip/hip_runtime.h>
+
+#include <unordered_map>
+
+#include "common/array_wrapper.hpp"
+#include "common/validation_helpers.hpp"
+#include "core/detail/casting.hpp"
+#include "core/detail/math/math.hpp"
+#include "kernels/device/warp_affine_device.hpp"
+#include "kernels/host/warp_affine_host.hpp"
 
 namespace roccv {
+
+template <typename T, eBorderType B, eInterpolationType I>
+void dispatch_warp_affine_interp(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                 const AffineTransform affineInv, T borderValue, eDeviceType device) {
+    ArrayWrapper<float, 6> transform(affineInv);
+    ImageWrapper<T> outputWrapper(output);
+    InterpolationWrapper<T, B, I> inputWrapper(input, borderValue);
+
+    switch (device) {
+        case eDeviceType::GPU: {
+            dim3 block(64, 16);
+            dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
+                      outputWrapper.batches());
+            Kernels::Device::warp_affine<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, transform);
+            break;
+        }
+
+        case eDeviceType::CPU: {
+            Kernels::Host::warp_affine(inputWrapper, outputWrapper, transform);
+            break;
+        }
+    }
+}
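+
+// Worked example (illustrative): for a 1920x1080 output with block = (64, 16),
+// the grid is ceil(1920/64) x ceil(1080/16) x batches = 30 x 68 x N blocks; the
+// in-kernel bounds check discards threads in the partial-block overhang.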
+
+template <typename T, eBorderType B>
+void dispatch_warp_affine_border_mode(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                      const AffineTransform affineInv, eInterpolationType interpolation, T borderValue,
+                                      eDeviceType device) {
+    // clang-format off
+    static const std::unordered_map<eInterpolationType, std::function<void(hipStream_t, const Tensor&, const Tensor&, const AffineTransform, T, eDeviceType)>>
+        funcs = {
+            {eInterpolationType::INTERP_TYPE_NEAREST, dispatch_warp_affine_interp<T, B, eInterpolationType::INTERP_TYPE_NEAREST>},
+            {eInterpolationType::INTERP_TYPE_LINEAR,  dispatch_warp_affine_interp<T, B, eInterpolationType::INTERP_TYPE_LINEAR>},
+            {eInterpolationType::INTERP_TYPE_CUBIC,   dispatch_warp_affine_interp<T, B, eInterpolationType::INTERP_TYPE_CUBIC>}
+        };
+    // clang-format on
+
+    if (!funcs.contains(interpolation)) {
+        throw Exception("Operation does not support the given interpolation mode.", eStatusType::NOT_IMPLEMENTED);
+    }
+
+    auto func = funcs.at(interpolation);
+    func(stream, input, output, affineInv, borderValue, device);
+}
+
+template <typename T>
+void dispatch_warp_affine_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                const AffineTransform affineInv, eInterpolationType interpolation,
+                                eBorderType borderType, float4 borderValue, eDeviceType device) {
+    // clang-format off
+    static const std::unordered_map<eBorderType, std::function<void(hipStream_t, const Tensor&, const Tensor&, const AffineTransform, eInterpolationType, T, eDeviceType)>>
+        funcs = {
+            {eBorderType::BORDER_TYPE_CONSTANT,   dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_CONSTANT>},
+            {eBorderType::BORDER_TYPE_REPLICATE,  dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_REPLICATE>},
+            {eBorderType::BORDER_TYPE_REFLECT,    dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_REFLECT>},
+            {eBorderType::BORDER_TYPE_REFLECT101, dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_REFLECT101>},
+            {eBorderType::BORDER_TYPE_WRAP,       dispatch_warp_affine_border_mode<T, eBorderType::BORDER_TYPE_WRAP>}
+        };
+    // clang-format on
+
+    if (!funcs.contains(borderType)) {
+        throw Exception("Operator does not support the given border mode.", eStatusType::NOT_IMPLEMENTED);
+    }
 
-void WarpAffine::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, const AffineTransform xform,
+    auto func = funcs.at(borderType);
+    func(stream, input, output, affineInv, interpolation, detail::SaturateCast<T>(borderValue), device);
+}
+
+void WarpAffine::operator()(hipStream_t stream, const Tensor &input, const Tensor &output, const AffineTransform xform,
                             bool isInverted, eInterpolationType interp, eBorderType borderMode, float4 borderValue,
                             eDeviceType device) const {
-    // An affine transformation is a subset of a perspective transform, so use the existing operator. All tensor
-    // validation is performed in the WarpPerspective operator as well.
+    CHECK_TENSOR_DEVICE(input, device);
+    CHECK_TENSOR_DATATYPES(input, DATA_TYPE_S8, DATA_TYPE_U8, DATA_TYPE_U16, DATA_TYPE_S16, DATA_TYPE_U32,
+                           DATA_TYPE_S32, DATA_TYPE_F32, DATA_TYPE_F64);
+    CHECK_TENSOR_LAYOUT(input, TENSOR_LAYOUT_HWC, TENSOR_LAYOUT_NHWC);
+    CHECK_TENSOR_CHANNELS(input, 1, 3, 4);
+
+    eDataType dtype = input.dtype().etype();
+    int64_t channels = input.shape(input.layout().channels_index());
+
+    CHECK_TENSOR_COMPARISON(input.device() == output.device());
+    CHECK_TENSOR_COMPARISON(output.shape(output.layout().channels_index()) == channels);
+    CHECK_TENSOR_COMPARISON(output.dtype() == input.dtype());
+    CHECK_TENSOR_COMPARISON(output.layout() == input.layout());
+    if (output.layout().batch_index() != -1) {
+        CHECK_TENSOR_COMPARISON(output.shape(output.layout().batch_index()) ==
+                                input.shape(input.layout().batch_index()));
+    }
+
+    PerspectiveTransform full{};
+#pragma unroll
+    for (int i = 0; i < 6; i++) {
+        full[i] = xform[i];
+    }
+    full[6] = 0.0f;
+    full[7] = 0.0f;
+    full[8] = 1.0f;
 
-    PerspectiveTransform transform;
+    detail::math::Matrix<float, 3, 3> mat;
+    mat.load(full);
+    if (!isInverted) {
+        detail::math::inv_inplace(mat);
+    }
+    mat.store(full);
+
+    AffineTransform affineInv{};
 #pragma unroll
     for (int i = 0; i < 6; i++) {
-        transform[i] = xform[i];
+        affineInv[i] = full[i];
     }
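+
+    // Worked example (illustrative): for xform = {2, 0, 0, 0, 2, 0} (uniform scale
+    // by 2) with isInverted == false, the inverted mapping is {0.5, 0, 0, 0, 0.5, 0},
+    // so every output pixel samples the input at half its own coordinates; inverse
+    // mapping guarantees each output pixel receives exactly one sample.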
-    transform[6] = 0.0f;
-    transform[7] = 0.0f;
-    transform[8] = 1.0f;
 
-    m_op(stream, input, output, transform, isInverted, interp, borderMode, borderValue, device);
+    // clang-format off
+    static const std::unordered_map<eDataType, std::array<std::function<void(hipStream_t, const Tensor&, const Tensor&, const AffineTransform, eInterpolationType, eBorderType, float4, eDeviceType)>, 4>>
+        funcs = {
+            {eDataType::DATA_TYPE_U8,  {dispatch_warp_affine_dtype<uchar1>,  0, dispatch_warp_affine_dtype<uchar3>,  dispatch_warp_affine_dtype<uchar4>}},
+            {eDataType::DATA_TYPE_S8,  {dispatch_warp_affine_dtype<char1>,   0, dispatch_warp_affine_dtype<char3>,   dispatch_warp_affine_dtype<char4>}},
+            {eDataType::DATA_TYPE_U16, {dispatch_warp_affine_dtype<ushort1>, 0, dispatch_warp_affine_dtype<ushort3>, dispatch_warp_affine_dtype<ushort4>}},
+            {eDataType::DATA_TYPE_S16, {dispatch_warp_affine_dtype<short1>,  0, dispatch_warp_affine_dtype<short3>,  dispatch_warp_affine_dtype<short4>}},
+            {eDataType::DATA_TYPE_U32, {dispatch_warp_affine_dtype<uint1>,   0, dispatch_warp_affine_dtype<uint3>,   dispatch_warp_affine_dtype<uint4>}},
+            {eDataType::DATA_TYPE_S32, {dispatch_warp_affine_dtype<int1>,    0, dispatch_warp_affine_dtype<int3>,    dispatch_warp_affine_dtype<int4>}},
+            {eDataType::DATA_TYPE_F32, {dispatch_warp_affine_dtype<float1>,  0, dispatch_warp_affine_dtype<float3>,  dispatch_warp_affine_dtype<float4>}},
+            {eDataType::DATA_TYPE_F64, {dispatch_warp_affine_dtype<double1>, 0, dispatch_warp_affine_dtype<double3>, dispatch_warp_affine_dtype<double4>}}
+        };
+    // clang-format on
+
+    auto func = funcs.at(dtype)[channels - 1];
+    if (func == 0) throw Exception("Not mapped to a defined function.", eStatusType::INVALID_OPERATION);
+
+    func(stream, input, output, affineInv, interp, borderMode, borderValue, device);
 }
-}  // namespace roccv
\ No newline at end of file
+}  // namespace roccv