From 718c1ad3d3985379fe73ab4ab6a37cc2ac209356 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 1 Apr 2026 15:01:59 -0400 Subject: [PATCH 1/9] Improve hot paths for BorderWrapper/InterpolationWrapper --- include/core/wrappers/border_wrapper.hpp | 29 +++++++-- .../core/wrappers/interpolation_wrapper.hpp | 63 ++++++++++++++----- 2 files changed, 71 insertions(+), 21 deletions(-) diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp index f55493a5..84c80835 100644 --- a/include/core/wrappers/border_wrapper.hpp +++ b/include/core/wrappers/border_wrapper.hpp @@ -27,6 +27,16 @@ #include "operator_types.h" namespace roccv { +namespace detail { + +/** Euclidean modulo: result in [0, modulus) for modulus > 0. One hardware remainder vs (a%m+m)%m. */ +__device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) { + int64_t r = a % modulus; + if (r < 0) r += modulus; + return r; +} + +} // namespace detail /** * @brief Wrapper class for ImageWrapper. This extends the descriptors by defining behaviors for when tensor @@ -56,6 +66,13 @@ class BorderWrapper { BorderWrapper(ImageWrapper image_wrapper, T border_value) : m_desc(image_wrapper), m_border_value(border_value) {} + /** + * @brief Sample the underlying image with no border logic. Caller must ensure coordinates are in-range. + */ + __device__ __host__ inline const T at_inbounds(int64_t n, int64_t h, int64_t w, int64_t c) const { + return m_desc.at(n, h, w, c); + } + /** * @brief Returns a reference to the underlying data given image coordinates. If the coordinates fall out of bounds, * a fallback reference based on the provided border type will be given instead. @@ -92,11 +109,11 @@ class BorderWrapper { // is the intended behavior for this border mode.) if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) { int64_t scale = imgWidth * 2; - int64_t val = (w % scale + scale) % scale; + int64_t val = detail::euclid_mod_i64(w, scale); x = (val < imgWidth) ? val : scale - 1 - val; scale = imgHeight * 2; - val = (h % scale + scale) % scale; + val = detail::euclid_mod_i64(h, scale); y = (val < imgHeight) ? val : scale - 1 - val; } @@ -105,7 +122,7 @@ class BorderWrapper { x = 0; } else { int64_t scale = 2 * imgWidth - 2; - x = (w % scale + scale) % scale; + x = detail::euclid_mod_i64(w, scale); x = imgWidth - 1 - std::abs(imgWidth - 1 - x); } @@ -113,7 +130,7 @@ class BorderWrapper { y = 0; } else { int64_t scale = 2 * imgHeight - 2; - y = (h % scale + scale) % scale; + y = detail::euclid_mod_i64(h, scale); y = imgHeight - 1 - std::abs(imgHeight - 1 - y); } } @@ -127,11 +144,11 @@ class BorderWrapper { // Wrap border type implementation if constexpr (BorderType == eBorderType::BORDER_TYPE_WRAP) { if (w < 0 || w >= imgWidth) { - x = (w % imgWidth + imgWidth) % imgWidth; + x = detail::euclid_mod_i64(w, imgWidth); } if (h < 0 || h >= imgHeight) { - y = (h % imgHeight + imgHeight) % imgHeight; + y = detail::euclid_mod_i64(h, imgHeight); } } diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp index 7adb8cb6..e56b1e59 100644 --- a/include/core/wrappers/interpolation_wrapper.hpp +++ b/include/core/wrappers/interpolation_wrapper.hpp @@ -94,8 +94,12 @@ class InterpolationWrapper { */ inline __device__ __host__ const T at(int64_t n, float h, float w, int64_t c) const { if constexpr (I == eInterpolationType::INTERP_TYPE_NEAREST) { - // Nearest neighbor interpolation implementation - return m_desc.at(n, lroundf(h), lroundf(w), c); + const int64_t rh = static_cast(lroundf(h)); + const int64_t rw = static_cast(lroundf(w)); + if (rw >= 0 && rw < m_desc.width() && rh >= 0 && rh < m_desc.height()) { + return m_desc.at_inbounds(n, rh, rw, c); + } + return m_desc.at(n, rh, rw, c); } else if constexpr (I == eInterpolationType::INTERP_TYPE_LINEAR) { // Bilinear interpolation implementation // v1 -- v2 @@ -109,34 +113,63 @@ class InterpolationWrapper { int64_t y0 = static_cast(floorf(h)); int64_t y1 = y0 + 1; + if (x0 >= 0 && y0 >= 0 && x1 < m_desc.width() && y1 < m_desc.height()) { + auto v1 = detail::RangeCast(m_desc.at_inbounds(n, y0, x0, c)); + auto v2 = detail::RangeCast(m_desc.at_inbounds(n, y0, x1, c)); + auto v3 = detail::RangeCast(m_desc.at_inbounds(n, y1, x0, c)); + auto v4 = detail::RangeCast(m_desc.at_inbounds(n, y1, x1, c)); + auto q1 = v1 * static_cast(x1 - w) + v2 * static_cast(w - x0); + auto q2 = v3 * static_cast(x1 - w) + v4 * static_cast(w - x0); + auto q = q1 * static_cast(y1 - h) + q2 * static_cast(h - y0); + return detail::RangeCast(q); + } + auto v1 = detail::RangeCast(m_desc.at(n, y0, x0, c)); auto v2 = detail::RangeCast(m_desc.at(n, y0, x1, c)); auto v3 = detail::RangeCast(m_desc.at(n, y1, x0, c)); auto v4 = detail::RangeCast(m_desc.at(n, y1, x1, c)); - auto q1 = v1 * (x1 - w) + v2 * (w - x0); - auto q2 = v3 * (x1 - w) + v4 * (w - x0); - auto q = q1 * (y1 - h) + q2 * (h - y0); + auto q1 = v1 * static_cast(x1 - w) + v2 * static_cast(w - x0); + auto q2 = v3 * static_cast(x1 - w) + v4 * static_cast(w - x0); + auto q = q1 * static_cast(y1 - h) + q2 * static_cast(h - y0); return detail::RangeCast(q); } else if constexpr (I == eInterpolationType::INTERP_TYPE_CUBIC) { using namespace roccv::detail; using WorkType = detail::MakeType>; - // Integer coordinates for pixel (x, y) - int64_t int_x = static_cast(floorf(w)); - int64_t int_y = static_cast(floorf(h)); + const int64_t int_x = static_cast(floorf(w)); + const int64_t int_y = static_cast(floorf(h)); - // Calculate weights float weight_x[4], weight_y[4]; - CalBicubicWeights(w - int_x, weight_x); - CalBicubicWeights(h - int_y, weight_y); + CalBicubicWeights(w - static_cast(int_x), weight_x); + CalBicubicWeights(h - static_cast(int_y), weight_y); + + float wxy[16]; + int k = 0; + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + wxy[k++] = weight_y[j] * weight_x[i]; + } + } - // Weighted sum WorkType sum = SetAll(0.0f); - for (int index_y = -1; index_y <= 2; index_y++) { - for (int index_x = -1; index_x <= 2; index_x++) { - sum += detail::RangeCast(m_desc.at(n, int_y + index_y, int_x + index_x, 0)) * (weight_x[index_x + 1] * weight_y[index_y + 1]); + const bool cubic_fast = int_x >= 1 && int_y >= 1 && (int_x + 2) < m_desc.width() && (int_y + 2) < m_desc.height(); + k = 0; + if (cubic_fast) { + for (int index_y = -1; index_y <= 2; index_y++) { + for (int index_x = -1; index_x <= 2; index_x++) { + sum = sum + detail::RangeCast( + m_desc.at_inbounds(n, int_y + index_y, int_x + index_x, c)) * + wxy[k++]; + } + } + } else { + for (int index_y = -1; index_y <= 2; index_y++) { + for (int index_x = -1; index_x <= 2; index_x++) { + sum = sum + detail::RangeCast(m_desc.at(n, int_y + index_y, int_x + index_x, c)) * + wxy[k++]; + } } } From 00f6b891b6dd63ec0275fee4e755a7388931a8ad Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 1 Apr 2026 15:42:20 -0400 Subject: [PATCH 2/9] Add dedicated warp_affine path --- include/core/wrappers/border_wrapper.hpp | 27 +++- include/kernels/device/warp_affine_device.hpp | 45 ++++++ include/kernels/host/warp_affine_host.hpp | 42 ++++++ include/op_warp_affine.hpp | 9 +- src/op_warp_affine.cpp | 139 ++++++++++++++++-- 5 files changed, 241 insertions(+), 21 deletions(-) create mode 100644 include/kernels/device/warp_affine_device.hpp create mode 100644 include/kernels/host/warp_affine_host.hpp diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp index 84c80835..67342ec2 100644 --- a/include/core/wrappers/border_wrapper.hpp +++ b/include/core/wrappers/border_wrapper.hpp @@ -21,6 +21,7 @@ #pragma once +#include #include #include "core/wrappers/image_wrapper.hpp" @@ -36,6 +37,24 @@ __device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) { return r; } +/** + * On GPU, use 32-bit remainder when operands fit; avoids 64-bit integer division in REFLECT/WRAP hot paths. + * Host always uses euclid_mod_i64. Values must stay in range for correctness. + */ +__device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulus) { +#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) + constexpr int64_t kLim = int64_t{1} << 30; + if (modulus > 0 && modulus < kLim && a > -kLim && a < kLim) { + int32_t ai = static_cast(a); + int32_t m = static_cast(modulus); + int32_t r = ai % m; + if (r < 0) r += m; + return static_cast(r); + } +#endif + return euclid_mod_i64(a, modulus); +} + } // namespace detail /** @@ -109,11 +128,11 @@ class BorderWrapper { // is the intended behavior for this border mode.) if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) { int64_t scale = imgWidth * 2; - int64_t val = detail::euclid_mod_i64(w, scale); + int64_t val = detail::euclid_mod_i64_fast(w, scale); x = (val < imgWidth) ? val : scale - 1 - val; scale = imgHeight * 2; - val = detail::euclid_mod_i64(h, scale); + val = detail::euclid_mod_i64_fast(h, scale); y = (val < imgHeight) ? val : scale - 1 - val; } @@ -144,11 +163,11 @@ class BorderWrapper { // Wrap border type implementation if constexpr (BorderType == eBorderType::BORDER_TYPE_WRAP) { if (w < 0 || w >= imgWidth) { - x = detail::euclid_mod_i64(w, imgWidth); + x = detail::euclid_mod_i64_fast(w, imgWidth); } if (h < 0 || h >= imgHeight) { - y = detail::euclid_mod_i64(h, imgHeight); + y = detail::euclid_mod_i64_fast(h, imgHeight); } } diff --git a/include/kernels/device/warp_affine_device.hpp b/include/kernels/device/warp_affine_device.hpp new file mode 100644 index 00000000..bf417d52 --- /dev/null +++ b/include/kernels/device/warp_affine_device.hpp @@ -0,0 +1,45 @@ +/** +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include "core/wrappers/interpolation_wrapper.hpp" + +namespace Kernels { +namespace Device { + +template +__global__ void warp_affine(SrcWrapper input, DstWrapper output, Mat mat) { + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + const int b = blockIdx.z; + + if (x >= output.width() || y >= output.height()) return; + + const float ox = mat[0] * static_cast(x) + mat[1] * static_cast(y) + mat[2]; + const float oy = mat[3] * static_cast(x) + mat[4] * static_cast(y) + mat[5]; + output.at(b, y, x, 0) = input.at(b, oy, ox, 0); +} +} // namespace Device +} // namespace Kernels diff --git a/include/kernels/host/warp_affine_host.hpp b/include/kernels/host/warp_affine_host.hpp new file mode 100644 index 00000000..085c2179 --- /dev/null +++ b/include/kernels/host/warp_affine_host.hpp @@ -0,0 +1,42 @@ +/** +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +namespace Kernels { +namespace Host { +template +void warp_affine(SrcWrapper input, DstWrapper output, Mat mat) { + for (int b = 0; b < output.batches(); b++) { + for (int y = 0; y < output.height(); y++) { + for (int x = 0; x < output.width(); x++) { + const float ox = mat[0] * static_cast(x) + mat[1] * static_cast(y) + mat[2]; + const float oy = mat[3] * static_cast(x) + mat[4] * static_cast(y) + mat[5]; + output.at(b, y, x, 0) = input.at(b, oy, ox, 0); + } + } + } +} +} // namespace Host +} // namespace Kernels diff --git a/include/op_warp_affine.hpp b/include/op_warp_affine.hpp index 26960501..4ca7c751 100644 --- a/include/op_warp_affine.hpp +++ b/include/op_warp_affine.hpp @@ -21,9 +21,10 @@ THE SOFTWARE. */ #pragma once +#include + #include "core/tensor.hpp" #include "i_operator.hpp" -#include "op_warp_perspective.hpp" #include "operator_types.h" namespace roccv { @@ -40,7 +41,7 @@ class WarpAffine final : public IOperator { * @brief Construct a new WarpAffine object * */ - WarpAffine() : m_op() {} + WarpAffine() = default; /** * @brief Destroy the WarpAffine object @@ -89,9 +90,5 @@ class WarpAffine final : public IOperator { void operator()(hipStream_t stream, const Tensor& input, const Tensor& output, const AffineTransform xform, bool isInverted, eInterpolationType interp, eBorderType borderMode, float4 borderValue, eDeviceType device = eDeviceType::GPU) const; - - private: - // WarpPerspective op is used to execute the affine transformation, as affine transformation is a subset. - WarpPerspective m_op; }; } // namespace roccv \ No newline at end of file diff --git a/src/op_warp_affine.cpp b/src/op_warp_affine.cpp index a7346507..ee29111f 100644 --- a/src/op_warp_affine.cpp +++ b/src/op_warp_affine.cpp @@ -21,25 +21,142 @@ THE SOFTWARE. */ #include "op_warp_affine.hpp" -#include +#include + +#include "common/array_wrapper.hpp" +#include "common/validation_helpers.hpp" +#include "core/detail/casting.hpp" +#include "core/detail/math/math.hpp" +#include "kernels/device/warp_affine_device.hpp" +#include "kernels/host/warp_affine_host.hpp" namespace roccv { +template +void dispatch_warp_affine_interp(hipStream_t stream, const Tensor &input, const Tensor &output, + const AffineTransform affineInv, T borderValue, eDeviceType device) { + ArrayWrapper transform(affineInv); + ImageWrapper outputWrapper(output); + InterpolationWrapper inputWrapper(input, borderValue); + + switch (device) { + case eDeviceType::GPU: { + dim3 block(64, 16); + dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, + outputWrapper.batches()); + Kernels::Device::warp_affine<<>>(inputWrapper, outputWrapper, transform); + break; + } + + case eDeviceType::CPU: { + Kernels::Host::warp_affine(inputWrapper, outputWrapper, transform); + break; + } + } +} + +template +void dispatch_warp_affine_border_mode(hipStream_t stream, const Tensor &input, const Tensor &output, + const AffineTransform affineInv, eInterpolationType interpolation, T borderValue, + eDeviceType device) { + // clang-format off + static const std::unordered_map> + funcs = { + {eInterpolationType::INTERP_TYPE_NEAREST, dispatch_warp_affine_interp}, + {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_warp_affine_interp}, + {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_warp_affine_interp} + }; + // clang-format on + + if (!funcs.contains(interpolation)) { + throw Exception("Operation does not support the given interpolation mode.", eStatusType::NOT_IMPLEMENTED); + } + + auto func = funcs.at(interpolation); + func(stream, input, output, affineInv, borderValue, device); +} + +template +void dispatch_warp_affine_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, + const AffineTransform affineInv, eInterpolationType interpolation, + eBorderType borderType, float4 borderValue, eDeviceType device) { + // clang-format off + static const std::unordered_map> + funcs = { + {eBorderType::BORDER_TYPE_CONSTANT, dispatch_warp_affine_border_mode}, + {eBorderType::BORDER_TYPE_REPLICATE, dispatch_warp_affine_border_mode}, + {eBorderType::BORDER_TYPE_REFLECT, dispatch_warp_affine_border_mode}, + {eBorderType::BORDER_TYPE_REFLECT101, dispatch_warp_affine_border_mode}, + {eBorderType::BORDER_TYPE_WRAP, dispatch_warp_affine_border_mode} + }; + // clang-format on + + if (!funcs.contains(borderType)) { + throw Exception("Operator does not support the given border mode.", eStatusType::NOT_IMPLEMENTED); + } -void WarpAffine::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, const AffineTransform xform, + auto func = funcs.at(borderType); + func(stream, input, output, affineInv, interpolation, detail::SaturateCast(borderValue), device); +} + +void WarpAffine::operator()(hipStream_t stream, const Tensor &input, const Tensor &output, const AffineTransform xform, bool isInverted, eInterpolationType interp, eBorderType borderMode, float4 borderValue, eDeviceType device) const { - // An affine transformation is a subset of a perspective transform, so use the existing operator. All tensor - // validation is performed in the WarpPerspective operator as well. + CHECK_TENSOR_DEVICE(input, device); + CHECK_TENSOR_DATATYPES(input, DATA_TYPE_S8, DATA_TYPE_U8, DATA_TYPE_U16, DATA_TYPE_S16, DATA_TYPE_U32, + DATA_TYPE_S32, DATA_TYPE_F32, DATA_TYPE_F64); + CHECK_TENSOR_LAYOUT(input, TENSOR_LAYOUT_HWC, TENSOR_LAYOUT_NHWC); + CHECK_TENSOR_CHANNELS(input, 1, 3, 4); + + eDataType dtype = input.dtype().etype(); + int64_t channels = input.shape(input.layout().channels_index()); + + CHECK_TENSOR_COMPARISON(input.device() == output.device()); + CHECK_TENSOR_COMPARISON(output.shape(output.layout().channels_index()) == channels); + CHECK_TENSOR_COMPARISON(output.dtype() == input.dtype()); + CHECK_TENSOR_COMPARISON(output.layout() == input.layout()); + if (output.layout().batch_index() != -1) { + CHECK_TENSOR_COMPARISON(output.shape(output.layout().batch_index()) == + input.shape(input.layout().batch_index())); + } + + PerspectiveTransform full{}; +#pragma unroll + for (int i = 0; i < 6; i++) { + full[i] = xform[i]; + } + full[6] = 0.0f; + full[7] = 0.0f; + full[8] = 1.0f; - PerspectiveTransform transform; + detail::math::Matrix mat; + mat.load(full); + if (!isInverted) { + detail::math::inv_inplace(mat); + } + mat.store(full); + + AffineTransform affineInv{}; #pragma unroll for (int i = 0; i < 6; i++) { - transform[i] = xform[i]; + affineInv[i] = full[i]; } - transform[6] = 0.0f; - transform[7] = 0.0f; - transform[8] = 1.0f; - m_op(stream, input, output, transform, isInverted, interp, borderMode, borderValue, device); + // clang-format off + static const std::unordered_map, 4>> + funcs = { + {eDataType::DATA_TYPE_U8, {dispatch_warp_affine_dtype, 0, dispatch_warp_affine_dtype, dispatch_warp_affine_dtype}}, + {eDataType::DATA_TYPE_S8, {dispatch_warp_affine_dtype, 0, dispatch_warp_affine_dtype, dispatch_warp_affine_dtype}}, + {eDataType::DATA_TYPE_U16, {dispatch_warp_affine_dtype, 0, dispatch_warp_affine_dtype, dispatch_warp_affine_dtype}}, + {eDataType::DATA_TYPE_S16, {dispatch_warp_affine_dtype, 0, dispatch_warp_affine_dtype, dispatch_warp_affine_dtype}}, + {eDataType::DATA_TYPE_U32, {dispatch_warp_affine_dtype, 0, dispatch_warp_affine_dtype, dispatch_warp_affine_dtype}}, + {eDataType::DATA_TYPE_S32, {dispatch_warp_affine_dtype, 0, dispatch_warp_affine_dtype, dispatch_warp_affine_dtype}}, + {eDataType::DATA_TYPE_F32, {dispatch_warp_affine_dtype, 0, dispatch_warp_affine_dtype, dispatch_warp_affine_dtype}}, + {eDataType::DATA_TYPE_F64, {dispatch_warp_affine_dtype, 0, dispatch_warp_affine_dtype, dispatch_warp_affine_dtype}} + }; + // clang-format on + + auto func = funcs.at(dtype)[channels - 1]; + if (func == 0) throw Exception("Not mapped to a defined function.", eStatusType::INVALID_OPERATION); + func(stream, input, output, affineInv, interp, borderMode, borderValue, device); } -} // namespace roccv \ No newline at end of file +} // namespace roccv From 5af67d27220e4264a8dd72c3efd3e043462bff97 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 1 Apr 2026 16:02:50 -0400 Subject: [PATCH 3/9] Remove nearest bounds check --- include/core/wrappers/interpolation_wrapper.hpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp index e56b1e59..144addf7 100644 --- a/include/core/wrappers/interpolation_wrapper.hpp +++ b/include/core/wrappers/interpolation_wrapper.hpp @@ -23,8 +23,8 @@ #include "core/detail/casting.hpp" #include "core/detail/math/vectorized_type_math.hpp" -#include "core/wrappers/border_wrapper.hpp" #include "core/detail/vector_utils.hpp" +#include "core/wrappers/border_wrapper.hpp" #include "operator_types.h" namespace roccv { @@ -96,9 +96,6 @@ class InterpolationWrapper { if constexpr (I == eInterpolationType::INTERP_TYPE_NEAREST) { const int64_t rh = static_cast(lroundf(h)); const int64_t rw = static_cast(lroundf(w)); - if (rw >= 0 && rw < m_desc.width() && rh >= 0 && rh < m_desc.height()) { - return m_desc.at_inbounds(n, rh, rw, c); - } return m_desc.at(n, rh, rw, c); } else if constexpr (I == eInterpolationType::INTERP_TYPE_LINEAR) { // Bilinear interpolation implementation @@ -154,21 +151,22 @@ class InterpolationWrapper { } WorkType sum = SetAll(0.0f); - const bool cubic_fast = int_x >= 1 && int_y >= 1 && (int_x + 2) < m_desc.width() && (int_y + 2) < m_desc.height(); + const bool cubic_fast = + int_x >= 1 && int_y >= 1 && (int_x + 2) < m_desc.width() && (int_y + 2) < m_desc.height(); k = 0; if (cubic_fast) { for (int index_y = -1; index_y <= 2; index_y++) { for (int index_x = -1; index_x <= 2; index_x++) { - sum = sum + detail::RangeCast( - m_desc.at_inbounds(n, int_y + index_y, int_x + index_x, c)) * + sum = sum + + detail::RangeCast(m_desc.at_inbounds(n, int_y + index_y, int_x + index_x, c)) * wxy[k++]; } } } else { for (int index_y = -1; index_y <= 2; index_y++) { for (int index_x = -1; index_x <= 2; index_x++) { - sum = sum + detail::RangeCast(m_desc.at(n, int_y + index_y, int_x + index_x, c)) * - wxy[k++]; + sum = sum + + detail::RangeCast(m_desc.at(n, int_y + index_y, int_x + index_x, c)) * wxy[k++]; } } } From b781467f7f45d57d0465fba15f8d429c77743dd1 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 15 Apr 2026 12:39:07 -0400 Subject: [PATCH 4/9] Per-axis bounds checking in border wrapper --- include/core/wrappers/border_wrapper.hpp | 30 +++++++++++++++--------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp index 67342ec2..80942cc0 100644 --- a/include/core/wrappers/border_wrapper.hpp +++ b/include/core/wrappers/border_wrapper.hpp @@ -21,9 +21,10 @@ #pragma once -#include #include +#include + #include "core/wrappers/image_wrapper.hpp" #include "operator_types.h" @@ -127,13 +128,16 @@ class BorderWrapper { // Reflect border type implementation. (Note: This is NOT REFLECT101, pixels at the border will be duplicated as // is the intended behavior for this border mode.) if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) { - int64_t scale = imgWidth * 2; - int64_t val = detail::euclid_mod_i64_fast(w, scale); - x = (val < imgWidth) ? val : scale - 1 - val; - - scale = imgHeight * 2; - val = detail::euclid_mod_i64_fast(h, scale); - y = (val < imgHeight) ? val : scale - 1 - val; + if (w < 0 || w >= imgWidth) { + int64_t scale = imgWidth * 2; + int64_t val = detail::euclid_mod_i64_fast(w, scale); + x = (val < imgWidth) ? val : scale - 1 - val; + } + if (h < 0 || h >= imgHeight) { + int64_t scale = imgHeight * 2; + int64_t val = detail::euclid_mod_i64_fast(h, scale); + y = (val < imgHeight) ? val : scale - 1 - val; + } } if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT101) { @@ -154,10 +158,14 @@ class BorderWrapper { } } - // Replicate border type implementation + // Replicate: snap OOB axes to nearest edge; in-range axes stay x=w / y=h (see global early return). if constexpr (BorderType == eBorderType::BORDER_TYPE_REPLICATE) { - x = std::clamp(w, 0, imgWidth - 1); - y = std::clamp(h, 0, imgHeight - 1); + if (w < 0 || w >= imgWidth) { + x = (w < 0) ? 0 : imgWidth - 1; + } + if (h < 0 || h >= imgHeight) { + y = (h < 0) ? 0 : imgHeight - 1; + } } // Wrap border type implementation From 56f0831c682facc49bedb61565a8fd01e117203f Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 16 Apr 2026 10:51:56 -0400 Subject: [PATCH 5/9] Unroll for loops in cubic interpolation implementation --- include/core/wrappers/border_wrapper.hpp | 9 ++------- include/core/wrappers/interpolation_wrapper.hpp | 6 ++++++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp index 80942cc0..b99dccce 100644 --- a/include/core/wrappers/border_wrapper.hpp +++ b/include/core/wrappers/border_wrapper.hpp @@ -170,13 +170,8 @@ class BorderWrapper { // Wrap border type implementation if constexpr (BorderType == eBorderType::BORDER_TYPE_WRAP) { - if (w < 0 || w >= imgWidth) { - x = detail::euclid_mod_i64_fast(w, imgWidth); - } - - if (h < 0 || h >= imgHeight) { - y = detail::euclid_mod_i64_fast(h, imgHeight); - } + x = detail::euclid_mod_i64_fast(w, imgWidth); + y = detail::euclid_mod_i64_fast(h, imgHeight); } return m_desc.at(n, y, x, c); diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp index 144addf7..a3d6e9e7 100644 --- a/include/core/wrappers/interpolation_wrapper.hpp +++ b/include/core/wrappers/interpolation_wrapper.hpp @@ -144,7 +144,9 @@ class InterpolationWrapper { float wxy[16]; int k = 0; +#pragma unroll for (int j = 0; j < 4; j++) { +#pragma unroll for (int i = 0; i < 4; i++) { wxy[k++] = weight_y[j] * weight_x[i]; } @@ -155,7 +157,9 @@ class InterpolationWrapper { int_x >= 1 && int_y >= 1 && (int_x + 2) < m_desc.width() && (int_y + 2) < m_desc.height(); k = 0; if (cubic_fast) { +#pragma unroll for (int index_y = -1; index_y <= 2; index_y++) { +#pragma unroll for (int index_x = -1; index_x <= 2; index_x++) { sum = sum + detail::RangeCast(m_desc.at_inbounds(n, int_y + index_y, int_x + index_x, c)) * @@ -163,7 +167,9 @@ class InterpolationWrapper { } } } else { +#pragma unroll for (int index_y = -1; index_y <= 2; index_y++) { +#pragma unroll for (int index_x = -1; index_x <= 2; index_x++) { sum = sum + detail::RangeCast(m_desc.at(n, int_y + index_y, int_x + index_x, c)) * wxy[k++]; From 852176a1c27f634455850f98bc32fd090803ad42 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 16 Apr 2026 11:24:14 -0400 Subject: [PATCH 6/9] General optimizations for border/interpolation wrapper implementations --- include/core/wrappers/border_wrapper.hpp | 110 +++++++++++++----- .../core/wrappers/interpolation_wrapper.hpp | 76 +++++++----- 2 files changed, 126 insertions(+), 60 deletions(-) diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp index b99dccce..538d0cec 100644 --- a/include/core/wrappers/border_wrapper.hpp +++ b/include/core/wrappers/border_wrapper.hpp @@ -31,6 +31,34 @@ namespace roccv { namespace detail { +/** Branchless absolute value for int64 (two's complement); avoids libm/std::abs on GPU. */ +__device__ __host__ __forceinline__ int64_t abs_i64(int64_t v) { + const int64_t mask = v >> 63; + return (v ^ mask) - mask; +} + +__device__ __host__ __forceinline__ int32_t abs_i32(int32_t v) { + const int32_t mask = v >> 31; + return (v ^ mask) - mask; +} + +__device__ __host__ __forceinline__ int32_t min_i32(int32_t a, int32_t b) { return a < b ? a : b; } + +__device__ __host__ __forceinline__ int64_t min_i64(int64_t a, int64_t b) { return a < b ? a : b; } + +__device__ __host__ __forceinline__ int64_t max_i64(int64_t a, int64_t b) { return a > b ? a : b; } + +/** Clamp v to [lo, hi]. */ +__device__ __host__ __forceinline__ int64_t clamp_i64(int64_t v, int64_t lo, int64_t hi) { + return min_i64(max_i64(v, lo), hi); +} + +__device__ __host__ inline int32_t euclid_mod_i32(int32_t a, int32_t modulus) { + int32_t r = a % modulus; + if (r < 0) r += modulus; + return r; +} + /** Euclidean modulo: result in [0, modulus) for modulus > 0. One hardware remainder vs (a%m+m)%m. */ __device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) { int64_t r = a % modulus; @@ -56,6 +84,53 @@ __device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulu return euclid_mod_i64(a, modulus); } +/** + * OpenCV-style BORDER_REFLECT axis map: period 2*extent, edge pixels duplicated (not REFLECT101). + * Equivalent to: val = euclid_mod(coord, 2*extent); min(val, 2*extent - 1 - val). + * The min form avoids a branch on val < extent and fuses well with 32-bit mod on GPU. + */ +__device__ __host__ inline int64_t reflect_border_coord_i64(int64_t coord, int64_t extent) { +#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) + constexpr int64_t kLim = int64_t{1} << 30; + if (extent > 0 && extent < kLim && coord > -kLim && coord < kLim) { + const int32_t e = static_cast(extent); + const int32_t scale = e * 2; + int32_t val = static_cast(coord) % scale; + if (val < 0) val += scale; + const int32_t inv = scale - 1 - val; + return static_cast(min_i32(val, inv)); + } +#endif + const int64_t scale = extent * 2; + const int64_t val = euclid_mod_i64(coord, scale); + const int64_t inv = scale - 1 - val; + return min_i64(val, inv); +} + +/** + * BORDER_REFLECT101 axis map: period (2*extent - 2), endpoints excluded from reflection. + */ +__device__ __host__ inline int64_t reflect101_border_coord_i64(int64_t coord, int64_t extent) { + if (extent <= 1) { + return 0; + } + const int64_t scale = 2 * extent - 2; +#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) + constexpr int64_t kLim = int64_t{1} << 30; + if (extent < kLim && coord > -kLim && coord < kLim && scale > 0 && scale < kLim) { + const int32_t e = static_cast(extent); + const int32_t s = e * 2 - 2; + int32_t v = euclid_mod_i32(static_cast(coord), s); + const int32_t inner = (e - 1) - v; + const int32_t a = abs_i32(inner); + return static_cast((e - 1) - a); + } +#endif + const int64_t v = euclid_mod_i64_fast(coord, scale); + const int64_t inner = (extent - 1) - v; + return (extent - 1) - abs_i64(inner); +} + } // namespace detail /** @@ -129,43 +204,22 @@ class BorderWrapper { // is the intended behavior for this border mode.) if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) { if (w < 0 || w >= imgWidth) { - int64_t scale = imgWidth * 2; - int64_t val = detail::euclid_mod_i64_fast(w, scale); - x = (val < imgWidth) ? val : scale - 1 - val; + x = detail::reflect_border_coord_i64(w, imgWidth); } if (h < 0 || h >= imgHeight) { - int64_t scale = imgHeight * 2; - int64_t val = detail::euclid_mod_i64_fast(h, scale); - y = (val < imgHeight) ? val : scale - 1 - val; + y = detail::reflect_border_coord_i64(h, imgHeight); } } if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT101) { - if (imgWidth == 1) { - x = 0; - } else { - int64_t scale = 2 * imgWidth - 2; - x = detail::euclid_mod_i64(w, scale); - x = imgWidth - 1 - std::abs(imgWidth - 1 - x); - } - - if (imgHeight == 1) { - y = 0; - } else { - int64_t scale = 2 * imgHeight - 2; - y = detail::euclid_mod_i64(h, scale); - y = imgHeight - 1 - std::abs(imgHeight - 1 - y); - } + x = detail::reflect101_border_coord_i64(w, imgWidth); + y = detail::reflect101_border_coord_i64(h, imgHeight); } - // Replicate: snap OOB axes to nearest edge; in-range axes stay x=w / y=h (see global early return). + // Replicate: clamp to edge. Equivalent to per-axis OOB snap; min/max maps cleanly to GPU integer ops. if constexpr (BorderType == eBorderType::BORDER_TYPE_REPLICATE) { - if (w < 0 || w >= imgWidth) { - x = (w < 0) ? 0 : imgWidth - 1; - } - if (h < 0 || h >= imgHeight) { - y = (h < 0) ? 0 : imgHeight - 1; - } + x = detail::clamp_i64(w, 0, imgWidth - 1); + y = detail::clamp_i64(h, 0, imgHeight - 1); } // Wrap border type implementation diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp index a3d6e9e7..609f956a 100644 --- a/include/core/wrappers/interpolation_wrapper.hpp +++ b/include/core/wrappers/interpolation_wrapper.hpp @@ -21,13 +21,33 @@ #pragma once +#include + +#include + #include "core/detail/casting.hpp" -#include "core/detail/math/vectorized_type_math.hpp" #include "core/detail/vector_utils.hpp" #include "core/wrappers/border_wrapper.hpp" #include "operator_types.h" namespace roccv { +namespace detail { + +/** Floor to int64; on device uses elementwise floor intrinsic (matches HIP __float2ll_rd lowering). */ +__device__ __host__ __forceinline__ int64_t interp_floor_i64(float x) { +#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) + return static_cast(static_cast(__builtin_elementwise_floor(x))); +#else + return static_cast(floorf(x)); +#endif +} + +/** Nearest index; llroundf matches lroundf for float (half away from zero). */ +__device__ __host__ __forceinline__ int64_t interp_nearest_i64(float x) { + return static_cast(std::llroundf(x)); +} + +} // namespace detail /** * @brief A kernel-friendly wrapper which provides interpolation logic based on the given @@ -66,22 +86,12 @@ class InterpolationWrapper { * @return None. */ __device__ __host__ inline void CalBicubicWeights(float dist, float* weight) const { - weight[0] = -0.5f; - weight[0] = weight[0] * dist + 1.0f; - weight[0] = weight[0] * dist - 0.5f; - weight[0] = weight[0] * dist; - - weight[1] = 1.5f; - weight[1] = weight[1] * dist - 2.5f; - weight[1] = weight[1] * dist; - weight[1] = weight[1] * dist + 1.0f; - - weight[2] = -1.5f; - weight[2] = weight[2] * dist + 2.f; - weight[2] = weight[2] * dist + 0.5f; - weight[2] = weight[2] * dist; - - weight[3] = 1 - weight[0] - weight[1] - weight[2]; + const float d = dist; + // Fused multiply-add: single rounding vs separate mul+add (matches kernels/device style). + weight[0] = fmaf(fmaf(fmaf(-0.5f, d, 1.0f), d, -0.5f), d, 0.f); + weight[1] = fmaf(fmaf(fmaf(1.5f, d, -2.5f), d, 0.f), d, 1.0f); + weight[2] = fmaf(fmaf(fmaf(-1.5f, d, 2.f), d, 0.5f), d, 0.f); + weight[3] = 1.f - weight[0] - weight[1] - weight[2]; } /** @@ -94,9 +104,7 @@ class InterpolationWrapper { */ inline __device__ __host__ const T at(int64_t n, float h, float w, int64_t c) const { if constexpr (I == eInterpolationType::INTERP_TYPE_NEAREST) { - const int64_t rh = static_cast(lroundf(h)); - const int64_t rw = static_cast(lroundf(w)); - return m_desc.at(n, rh, rw, c); + return m_desc.at(n, detail::interp_nearest_i64(h), detail::interp_nearest_i64(w), c); } else if constexpr (I == eInterpolationType::INTERP_TYPE_LINEAR) { // Bilinear interpolation implementation // v1 -- v2 @@ -105,19 +113,23 @@ class InterpolationWrapper { using WorkType = detail::MakeType>; - int64_t x0 = static_cast(floorf(w)); - int64_t x1 = x0 + 1; - int64_t y0 = static_cast(floorf(h)); - int64_t y1 = y0 + 1; + const int64_t x0 = detail::interp_floor_i64(w); + const int64_t y0 = detail::interp_floor_i64(h); + const int64_t x1 = x0 + 1; + const int64_t y1 = y0 + 1; + const float fx = w - static_cast(x0); + const float fy = h - static_cast(y0); + const float omfx = 1.f - fx; + const float omfy = 1.f - fy; if (x0 >= 0 && y0 >= 0 && x1 < m_desc.width() && y1 < m_desc.height()) { auto v1 = detail::RangeCast(m_desc.at_inbounds(n, y0, x0, c)); auto v2 = detail::RangeCast(m_desc.at_inbounds(n, y0, x1, c)); auto v3 = detail::RangeCast(m_desc.at_inbounds(n, y1, x0, c)); auto v4 = detail::RangeCast(m_desc.at_inbounds(n, y1, x1, c)); - auto q1 = v1 * static_cast(x1 - w) + v2 * static_cast(w - x0); - auto q2 = v3 * static_cast(x1 - w) + v4 * static_cast(w - x0); - auto q = q1 * static_cast(y1 - h) + q2 * static_cast(h - y0); + auto q1 = v1 * omfx + v2 * fx; + auto q2 = v3 * omfx + v4 * fx; + auto q = q1 * omfy + q2 * fy; return detail::RangeCast(q); } @@ -126,17 +138,17 @@ class InterpolationWrapper { auto v3 = detail::RangeCast(m_desc.at(n, y1, x0, c)); auto v4 = detail::RangeCast(m_desc.at(n, y1, x1, c)); - auto q1 = v1 * static_cast(x1 - w) + v2 * static_cast(w - x0); - auto q2 = v3 * static_cast(x1 - w) + v4 * static_cast(w - x0); - auto q = q1 * static_cast(y1 - h) + q2 * static_cast(h - y0); + auto q1 = v1 * omfx + v2 * fx; + auto q2 = v3 * omfx + v4 * fx; + auto q = q1 * omfy + q2 * fy; return detail::RangeCast(q); } else if constexpr (I == eInterpolationType::INTERP_TYPE_CUBIC) { using namespace roccv::detail; using WorkType = detail::MakeType>; - const int64_t int_x = static_cast(floorf(w)); - const int64_t int_y = static_cast(floorf(h)); + const int64_t int_x = detail::interp_floor_i64(w); + const int64_t int_y = detail::interp_floor_i64(h); float weight_x[4], weight_y[4]; CalBicubicWeights(w - static_cast(int_x), weight_x); From 2beed7a87f441de98e41826f5c60596100b60c61 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 16 Apr 2026 11:42:10 -0400 Subject: [PATCH 7/9] Change warp perspective block size --- src/op_warp_perspective.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp index ca77fc8c..33717694 100644 --- a/src/op_warp_perspective.cpp +++ b/src/op_warp_perspective.cpp @@ -42,7 +42,7 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); + dim3 block(32, 8); dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, outputWrapper.batches()); Kernels::Device::warp_perspective<<>>(inputWrapper, outputWrapper, transform); From 79d84a1e5d0532c37a408ad6e577f6d7004383ec Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 16 Apr 2026 11:45:27 -0400 Subject: [PATCH 8/9] Revert block size changes --- src/op_warp_perspective.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp index 33717694..ca77fc8c 100644 --- a/src/op_warp_perspective.cpp +++ b/src/op_warp_perspective.cpp @@ -42,7 +42,7 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block(32, 8); + dim3 block(64, 16); dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, outputWrapper.batches()); Kernels::Device::warp_perspective<<>>(inputWrapper, outputWrapper, transform); From 99ac06789ec47d8eec5a52cd89ff46eea3f69bd7 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 16 Apr 2026 12:02:35 -0400 Subject: [PATCH 9/9] Organize helpers/improve docstrings --- include/core/detail/sampling_helpers.hpp | 164 ++++++++++++++++++ include/core/wrappers/border_wrapper.hpp | 81 ++------- .../core/wrappers/interpolation_wrapper.hpp | 23 +-- 3 files changed, 182 insertions(+), 86 deletions(-) create mode 100644 include/core/detail/sampling_helpers.hpp diff --git a/include/core/detail/sampling_helpers.hpp b/include/core/detail/sampling_helpers.hpp new file mode 100644 index 00000000..812868f8 --- /dev/null +++ b/include/core/detail/sampling_helpers.hpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#pragma once + +/** + * @file sampling_helpers.hpp + * @brief Small host/device helpers for border coordinate math and interpolation index conversion. + */ + +#include + +#include +#include + +namespace roccv { +namespace detail { + +/** + * @brief Branchless absolute value of a signed 64-bit integer (two's complement). + * @param v Input value. + * @return Non-negative absolute value of @p v. + * @note Avoids @c std::abs in device code; useful on GPU hot paths. + */ +__device__ __host__ __forceinline__ int64_t abs_i64(int64_t v) { + const int64_t mask = v >> 63; + return (v ^ mask) - mask; +} + +/** + * @brief Branchless absolute value of a signed 32-bit integer (two's complement). + * @param v Input value. + * @return Non-negative absolute value of @p v. + */ +__device__ __host__ __forceinline__ int32_t abs_i32(int32_t v) { + const int32_t mask = v >> 31; + return (v ^ mask) - mask; +} + +/** + * @brief Minimum of two 32-bit signed integers. + * @param a First operand. + * @param b Second operand. + * @return The lesser of @p a and @p b. + */ +__device__ __host__ __forceinline__ int32_t min_i32(int32_t a, int32_t b) { return a < b ? a : b; } + +/** + * @brief Minimum of two 64-bit signed integers. + * @param a First operand. + * @param b Second operand. + * @return The lesser of @p a and @p b. + */ +__device__ __host__ __forceinline__ int64_t min_i64(int64_t a, int64_t b) { return a < b ? a : b; } + +/** + * @brief Maximum of two 64-bit signed integers. + * @param a First operand. + * @param b Second operand. + * @return The greater of @p a and @p b. + */ +__device__ __host__ __forceinline__ int64_t max_i64(int64_t a, int64_t b) { return a > b ? a : b; } + +/** + * @brief Clamp a signed 64-bit integer to a closed interval. + * @param v Value to clamp. + * @param lo Lower bound (inclusive). + * @param hi Upper bound (inclusive); must satisfy @p lo <= @p hi. + * @return @p v restricted to the inclusive interval between @p lo and @p hi. + */ +__device__ __host__ __forceinline__ int64_t clamp_i64(int64_t v, int64_t lo, int64_t hi) { + return min_i64(max_i64(v, lo), hi); +} + +/** + * @brief Euclidean (non-negative) modulo for 32-bit operands. + * @param a Dividend. + * @param modulus Strictly positive modulus. + * @return Remainder in the half-open range [0, modulus), congruent to @p a modulo @p modulus. + */ +__device__ __host__ inline int32_t euclid_mod_i32(int32_t a, int32_t modulus) { + int32_t r = a % modulus; + if (r < 0) r += modulus; + return r; +} + +/** + * @brief Euclidean (non-negative) modulo for 64-bit operands. + * @param a Dividend. + * @param modulus Strictly positive modulus. + * @return Remainder in the half-open range [0, modulus), congruent to @p a modulo @p modulus. + * @note Uses a single remainder and correction instead of repeating modulo and add. + */ +__device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) { + int64_t r = a % modulus; + if (r < 0) r += modulus; + return r; +} + +/** + * @brief Euclidean modulo with a 32-bit fast path on the GPU when operands are in a safe range. + * @param a Dividend. + * @param modulus Strictly positive modulus. + * @return Same as euclid_mod_i64() when operands fit the device fast path; otherwise defers to euclid_mod_i64(). + * @note On device, uses 32-bit remainder when @p modulus and @p a are sufficiently small to avoid 64-bit division. + * On host, always uses euclid_mod_i64(). Caller must keep values in range when relying on the fast path. + */ +__device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulus) { +#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) + constexpr int64_t kLim = int64_t{1} << 30; + if (modulus > 0 && modulus < kLim && a > -kLim && a < kLim) { + int32_t ai = static_cast(a); + int32_t m = static_cast(modulus); + int32_t r = ai % m; + if (r < 0) r += m; + return static_cast(r); + } +#endif + return euclid_mod_i64(a, modulus); +} + +/** + * @brief Convert a subpixel coordinate to the integer grid index below @p x (floor). + * @param x Source coordinate in pixels. + * @return Largest int64 not greater than @p x (i.e. floor), suitable as the left/top neighbor index for bilinear/cubic. + * @note On device, uses a floor intrinsic compatible with HIP @c __float2ll_rd lowering; on host uses @c floorf(). + */ +__device__ __host__ __forceinline__ int64_t interp_floor_i64(float x) { +#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) + return static_cast(static_cast(__builtin_elementwise_floor(x))); +#else + return static_cast(floorf(x)); +#endif +} + +/** + * @brief Nearest-neighbor rounding of a subpixel coordinate to an integer index. + * @param x Source coordinate in pixels. + * @return Integer closest to @p x, with half values rounded away from zero (same convention as @c std::llroundf()). + */ +__device__ __host__ __forceinline__ int64_t interp_nearest_i64(float x) { + return static_cast(std::llroundf(x)); +} + +} // namespace detail +} // namespace roccv diff --git a/include/core/wrappers/border_wrapper.hpp b/include/core/wrappers/border_wrapper.hpp index 538d0cec..6289590b 100644 --- a/include/core/wrappers/border_wrapper.hpp +++ b/include/core/wrappers/border_wrapper.hpp @@ -21,73 +21,22 @@ #pragma once -#include - -#include - +#include "core/detail/sampling_helpers.hpp" #include "core/wrappers/image_wrapper.hpp" #include "operator_types.h" namespace roccv { -namespace detail { - -/** Branchless absolute value for int64 (two's complement); avoids libm/std::abs on GPU. */ -__device__ __host__ __forceinline__ int64_t abs_i64(int64_t v) { - const int64_t mask = v >> 63; - return (v ^ mask) - mask; -} - -__device__ __host__ __forceinline__ int32_t abs_i32(int32_t v) { - const int32_t mask = v >> 31; - return (v ^ mask) - mask; -} - -__device__ __host__ __forceinline__ int32_t min_i32(int32_t a, int32_t b) { return a < b ? a : b; } - -__device__ __host__ __forceinline__ int64_t min_i64(int64_t a, int64_t b) { return a < b ? a : b; } - -__device__ __host__ __forceinline__ int64_t max_i64(int64_t a, int64_t b) { return a > b ? a : b; } - -/** Clamp v to [lo, hi]. */ -__device__ __host__ __forceinline__ int64_t clamp_i64(int64_t v, int64_t lo, int64_t hi) { - return min_i64(max_i64(v, lo), hi); -} - -__device__ __host__ inline int32_t euclid_mod_i32(int32_t a, int32_t modulus) { - int32_t r = a % modulus; - if (r < 0) r += modulus; - return r; -} - -/** Euclidean modulo: result in [0, modulus) for modulus > 0. One hardware remainder vs (a%m+m)%m. */ -__device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) { - int64_t r = a % modulus; - if (r < 0) r += modulus; - return r; -} - -/** - * On GPU, use 32-bit remainder when operands fit; avoids 64-bit integer division in REFLECT/WRAP hot paths. - * Host always uses euclid_mod_i64. Values must stay in range for correctness. - */ -__device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulus) { -#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) - constexpr int64_t kLim = int64_t{1} << 30; - if (modulus > 0 && modulus < kLim && a > -kLim && a < kLim) { - int32_t ai = static_cast(a); - int32_t m = static_cast(modulus); - int32_t r = ai % m; - if (r < 0) r += m; - return static_cast(r); - } -#endif - return euclid_mod_i64(a, modulus); -} +namespace detail { /** - * OpenCV-style BORDER_REFLECT axis map: period 2*extent, edge pixels duplicated (not REFLECT101). - * Equivalent to: val = euclid_mod(coord, 2*extent); min(val, 2*extent - 1 - val). - * The min form avoids a branch on val < extent and fuses well with 32-bit mod on GPU. + * @brief Map one axis coordinate for OpenCV-style @c BORDER_REFLECT (edge pixels duplicated; not @c BORDER_REFLECT101). + * @param coord Possibly out-of-bounds coordinate along the axis (width or height index space). + * @param extent Positive extent of the axis (number of samples, e.g. image width or height). + * @return In-bounds index in [0, extent) after reflection. + * @note Period is 2 * extent. Implementation uses Euclidean modulo then + * min(val, 2*extent - 1 - val), which matches comparing @c val to @c extent with a ternary, without a + * separate branch on @p extent alone. On device, a 32-bit remainder path is used when @p extent and @p coord are + * in a safe range. */ __device__ __host__ inline int64_t reflect_border_coord_i64(int64_t coord, int64_t extent) { #if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) @@ -108,7 +57,12 @@ __device__ __host__ inline int64_t reflect_border_coord_i64(int64_t coord, int64 } /** - * BORDER_REFLECT101 axis map: period (2*extent - 2), endpoints excluded from reflection. + * @brief Map one axis coordinate for OpenCV-style @c BORDER_REFLECT101 (endpoints are not repeated in the reflection). + * @param coord Possibly out-of-bounds coordinate along the axis. + * @param extent Positive extent of the axis (number of samples). If @p extent is at most 1, returns @c 0. + * @return In-bounds index in [0, extent) after reflection. + * @note Period is 2 * extent - 2 when @p extent is greater than 1. Uses Euclidean modulo then folds with + * (extent - 1) - abs((extent - 1) - v). On device, a 32-bit path applies when values fit a fixed bound. */ __device__ __host__ inline int64_t reflect101_border_coord_i64(int64_t coord, int64_t extent) { if (extent <= 1) { @@ -130,7 +84,6 @@ __device__ __host__ inline int64_t reflect101_border_coord_i64(int64_t coord, in const int64_t inner = (extent - 1) - v; return (extent - 1) - abs_i64(inner); } - } // namespace detail /** @@ -253,7 +206,7 @@ class BorderWrapper { __device__ __host__ inline int64_t batches() const { return m_desc.batches(); } /** - * @brief Retries the number of channels in the image. + * @brief Retrieves the number of channels in the image. * * @return Image channels. */ diff --git a/include/core/wrappers/interpolation_wrapper.hpp b/include/core/wrappers/interpolation_wrapper.hpp index 609f956a..48fed6c3 100644 --- a/include/core/wrappers/interpolation_wrapper.hpp +++ b/include/core/wrappers/interpolation_wrapper.hpp @@ -21,34 +21,13 @@ #pragma once -#include - -#include - #include "core/detail/casting.hpp" +#include "core/detail/sampling_helpers.hpp" #include "core/detail/vector_utils.hpp" #include "core/wrappers/border_wrapper.hpp" #include "operator_types.h" namespace roccv { -namespace detail { - -/** Floor to int64; on device uses elementwise floor intrinsic (matches HIP __float2ll_rd lowering). */ -__device__ __host__ __forceinline__ int64_t interp_floor_i64(float x) { -#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) - return static_cast(static_cast(__builtin_elementwise_floor(x))); -#else - return static_cast(floorf(x)); -#endif -} - -/** Nearest index; llroundf matches lroundf for float (half away from zero). */ -__device__ __host__ __forceinline__ int64_t interp_nearest_i64(float x) { - return static_cast(std::llroundf(x)); -} - -} // namespace detail - /** * @brief A kernel-friendly wrapper which provides interpolation logic based on the given * coordinates. This tensor wrapper is typically only used for input tensors and does not provide write access to its