Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 164 additions & 0 deletions include/core/detail/sampling_helpers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
/*
* Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

#pragma once

/**
* @file sampling_helpers.hpp
* @brief Small host/device helpers for border coordinate math and interpolation index conversion.
*/

#include <hip/hip_runtime.h>

#include <cmath>
#include <cstdint>

namespace roccv {
namespace detail {

/**
 * @brief Branchless absolute value of a signed 64-bit integer (two's complement).
 * @param v Input value.
 * @return Non-negative absolute value of @p v.
 * @note Avoids @c std::abs in device code; useful on GPU hot paths.
 */
__device__ __host__ __forceinline__ int64_t abs_i64(int64_t v) {
    // sign is 0 when v >= 0, all-ones (-1) when v < 0 (arithmetic shift).
    const int64_t sign = v >> 63;
    // (v + sign) ^ sign negates v exactly when sign == -1, else returns v unchanged.
    return (v + sign) ^ sign;
}

/**
 * @brief Branchless absolute value of a signed 32-bit integer (two's complement).
 * @param v Input value.
 * @return Non-negative absolute value of @p v.
 */
__device__ __host__ __forceinline__ int32_t abs_i32(int32_t v) {
    // sign is 0 when v >= 0, all-ones (-1) when v < 0 (arithmetic shift).
    const int32_t sign = v >> 31;
    // (v + sign) ^ sign negates v exactly when sign == -1, else returns v unchanged.
    return (v + sign) ^ sign;
}

/**
 * @brief Minimum of two 32-bit signed integers.
 * @param a First operand.
 * @param b Second operand.
 * @return The lesser of @p a and @p b.
 */
__device__ __host__ __forceinline__ int32_t min_i32(int32_t a, int32_t b) {
    return (b < a) ? b : a;
}

/**
 * @brief Minimum of two 64-bit signed integers.
 * @param a First operand.
 * @param b Second operand.
 * @return The lesser of @p a and @p b.
 */
__device__ __host__ __forceinline__ int64_t min_i64(int64_t a, int64_t b) {
    return (b < a) ? b : a;
}

/**
 * @brief Maximum of two 64-bit signed integers.
 * @param a First operand.
 * @param b Second operand.
 * @return The greater of @p a and @p b.
 */
__device__ __host__ __forceinline__ int64_t max_i64(int64_t a, int64_t b) {
    return (a < b) ? b : a;
}

/**
 * @brief Clamp a signed 64-bit integer to a closed interval.
 * @param v Value to clamp.
 * @param lo Lower bound (inclusive).
 * @param hi Upper bound (inclusive); must satisfy @p lo <= @p hi.
 * @return @p v restricted to the inclusive interval between @p lo and @p hi.
 */
__device__ __host__ __forceinline__ int64_t clamp_i64(int64_t v, int64_t lo, int64_t hi) {
    // Guard-clause form; equivalent to min(max(v, lo), hi) when lo <= hi.
    if (v < lo) return lo;
    if (v > hi) return hi;
    return v;
}

/**
 * @brief Euclidean (non-negative) modulo for 32-bit operands.
 * @param a Dividend.
 * @param modulus Strictly positive modulus.
 * @return Remainder in the half-open range <tt>[0, modulus)</tt>, congruent to @p a modulo @p modulus.
 */
__device__ __host__ inline int32_t euclid_mod_i32(int32_t a, int32_t modulus) {
    // C++ % is truncating, so the remainder shares a's sign; shift negatives up once.
    const int32_t r = a % modulus;
    return (r < 0) ? r + modulus : r;
}

/**
 * @brief Euclidean (non-negative) modulo for 64-bit operands.
 * @param a Dividend.
 * @param modulus Strictly positive modulus.
 * @return Remainder in the half-open range <tt>[0, modulus)</tt>, congruent to @p a modulo @p modulus.
 * @note Uses a single remainder and correction instead of repeating modulo and add.
 */
__device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) {
    // C++ % is truncating, so the remainder shares a's sign; shift negatives up once.
    const int64_t r = a % modulus;
    return (r < 0) ? r + modulus : r;
}

/**
 * @brief Euclidean modulo with a 32-bit fast path on the GPU when operands are in a safe range.
 * @param a Dividend.
 * @param modulus Strictly positive modulus.
 * @return Same as euclid_mod_i64() when operands fit the device fast path; otherwise defers to euclid_mod_i64().
 * @note On device, uses 32-bit remainder when @p modulus and @p a are sufficiently small to avoid 64-bit division.
 *       On host, always uses euclid_mod_i64(). Caller must keep values in range when relying on the fast path.
 */
__device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulus) {
#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
    // 32-bit integer division is considerably cheaper than 64-bit on GPU.
    constexpr int64_t kLim = int64_t{1} << 30;
    if (modulus > 0 && modulus < kLim && a > -kLim && a < kLim) {
        // Delegate to the shared 32-bit helper instead of duplicating the
        // remainder-and-correct sequence; keeps both code paths in sync.
        return static_cast<int64_t>(
            euclid_mod_i32(static_cast<int32_t>(a), static_cast<int32_t>(modulus)));
    }
#endif
    return euclid_mod_i64(a, modulus);
}

/**
 * @brief Convert a subpixel coordinate to the integer grid index below @p x (floor).
 * @param x Source coordinate in pixels.
 * @return Largest int64 not greater than @p x (i.e. floor), suitable as the left/top neighbor index for
 *         bilinear/cubic interpolation.
 * @note On device, floors via @c __builtin_elementwise_floor (compatible with HIP @c __float2ll_rd lowering);
 *       on host uses @c floorf().
 */
__device__ __host__ __forceinline__ int64_t interp_floor_i64(float x) {
#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
    const float floored = __builtin_elementwise_floor(x);
    return static_cast<int64_t>(floored);
#else
    return static_cast<int64_t>(floorf(x));
#endif
}

/**
 * @brief Nearest-neighbor rounding of a subpixel coordinate to an integer index.
 * @param x Source coordinate in pixels.
 * @return Integer closest to @p x, with half values rounded away from zero (same convention as @c std::llroundf()).
 * @note On device, uses the HIP math library @c llroundf(); the @c std:: math overloads are not reliably
 *       device-callable, and this mirrors the device/host split already used by interp_floor_i64().
 */
__device__ __host__ __forceinline__ int64_t interp_nearest_i64(float x) {
#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
    return static_cast<int64_t>(llroundf(x));
#else
    return static_cast<int64_t>(std::llroundf(x));
#endif
}

} // namespace detail
} // namespace roccv
116 changes: 81 additions & 35 deletions include/core/wrappers/border_wrapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,71 @@

#pragma once

#include <hip/hip_runtime.h>

#include "core/detail/sampling_helpers.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "operator_types.h"

namespace roccv {

namespace detail {
/**
 * @brief Map one axis coordinate for OpenCV-style @c BORDER_REFLECT (edge pixels duplicated; not @c BORDER_REFLECT101).
 * @param coord Possibly out-of-bounds coordinate along the axis (width or height index space).
 * @param extent Positive extent of the axis (number of samples, e.g. image width or height).
 * @return In-bounds index in <tt>[0, extent)</tt> after reflection.
 * @note Period is <tt>2 * extent</tt>. Implementation uses Euclidean modulo then
 *       <tt>min(val, 2*extent - 1 - val)</tt>, which matches comparing @c val to @c extent with a ternary, without a
 *       separate branch on @p extent alone. On device, a 32-bit remainder path is used when @p extent and @p coord
 *       are in a safe range.
 */
__device__ __host__ inline int64_t reflect_border_coord_i64(int64_t coord, int64_t extent) {
#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
    constexpr int64_t kLim = int64_t{1} << 30;
    if (extent > 0 && extent < kLim && coord > -kLim && coord < kLim) {
        const int32_t scale = static_cast<int32_t>(extent) * 2;
        // Use the shared helper rather than open-coding remainder-and-correct,
        // matching the fast path of reflect101_border_coord_i64().
        const int32_t val = euclid_mod_i32(static_cast<int32_t>(coord), scale);
        const int32_t inv = scale - 1 - val;
        return static_cast<int64_t>(min_i32(val, inv));
    }
#endif
    const int64_t scale = extent * 2;
    const int64_t val = euclid_mod_i64(coord, scale);
    const int64_t inv = scale - 1 - val;
    return min_i64(val, inv);
}

/**
 * @brief Map one axis coordinate for OpenCV-style @c BORDER_REFLECT101 (endpoints are not repeated in the reflection).
 * @param coord Possibly out-of-bounds coordinate along the axis.
 * @param extent Positive extent of the axis (number of samples). If @p extent is at most 1, returns @c 0.
 * @return In-bounds index in <tt>[0, extent)</tt> after reflection.
 * @note Period is <tt>2 * extent - 2</tt> when @p extent is greater than 1. Uses Euclidean modulo then folds with
 *       <tt>(extent - 1) - abs((extent - 1) - v)</tt>. On device, a 32-bit path applies when values fit a fixed bound.
 */
__device__ __host__ inline int64_t reflect101_border_coord_i64(int64_t coord, int64_t extent) {
    // Degenerate axis: a single sample (or empty axis) reflects onto index 0.
    if (extent <= 1) {
        return 0;
    }
    const int64_t scale = 2 * extent - 2;
#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
    constexpr int64_t kLim = int64_t{1} << 30;
    if (extent < kLim && coord > -kLim && coord < kLim && scale > 0 && scale < kLim) {
        const int32_t e = static_cast<int32_t>(extent);
        const int32_t s = e * 2 - 2;
        const int32_t v = euclid_mod_i32(static_cast<int32_t>(coord), s);
        const int32_t inner = (e - 1) - v;
        return static_cast<int64_t>((e - 1) - abs_i32(inner));
    }
#endif
    // The device fast path above already rejected small operands, so call the
    // 64-bit helper directly; euclid_mod_i64_fast() would only repeat the range check.
    const int64_t v = euclid_mod_i64(coord, scale);
    const int64_t inner = (extent - 1) - v;
    return (extent - 1) - abs_i64(inner);
}
} // namespace detail

/**
* @brief Wrapper class for ImageWrapper. This extends the descriptors by defining behaviors for when tensor
* coordinates go out of scope.
Expand Down Expand Up @@ -56,6 +114,13 @@ class BorderWrapper {
BorderWrapper(ImageWrapper<T> image_wrapper, T border_value)
: m_desc(image_wrapper), m_border_value(border_value) {}

/**
 * @brief Sample the underlying image with no border logic. Caller must ensure coordinates are in-range.
 * @param n Batch index.
 * @param h Row (height) index.
 * @param w Column (width) index.
 * @param c Channel index.
 * @return The sample at (n, h, w, c), forwarded from the wrapped image descriptor.
 * @note Returns T by value; the previous top-level @c const on the return type was meaningless for
 *       by-value returns (and can inhibit move semantics for class-type T), so it has been dropped.
 *       This is invisible to callers.
 */
__device__ __host__ inline T at_inbounds(int64_t n, int64_t h, int64_t w, int64_t c) const {
    return m_desc.at(n, h, w, c);
}

/**
* @brief Returns a reference to the underlying data given image coordinates. If the coordinates fall out of bounds,
* a fallback reference based on the provided border type will be given instead.
Expand Down Expand Up @@ -91,48 +156,29 @@ class BorderWrapper {
// Reflect border type implementation. (Note: This is NOT REFLECT101, pixels at the border will be duplicated as
// is the intended behavior for this border mode.)
if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT) {
int64_t scale = imgWidth * 2;
int64_t val = (w % scale + scale) % scale;
x = (val < imgWidth) ? val : scale - 1 - val;

scale = imgHeight * 2;
val = (h % scale + scale) % scale;
y = (val < imgHeight) ? val : scale - 1 - val;
if (w < 0 || w >= imgWidth) {
x = detail::reflect_border_coord_i64(w, imgWidth);
}
if (h < 0 || h >= imgHeight) {
y = detail::reflect_border_coord_i64(h, imgHeight);
}
}

if constexpr (BorderType == eBorderType::BORDER_TYPE_REFLECT101) {
if (imgWidth == 1) {
x = 0;
} else {
int64_t scale = 2 * imgWidth - 2;
x = (w % scale + scale) % scale;
x = imgWidth - 1 - std::abs(imgWidth - 1 - x);
}

if (imgHeight == 1) {
y = 0;
} else {
int64_t scale = 2 * imgHeight - 2;
y = (h % scale + scale) % scale;
y = imgHeight - 1 - std::abs(imgHeight - 1 - y);
}
x = detail::reflect101_border_coord_i64(w, imgWidth);
y = detail::reflect101_border_coord_i64(h, imgHeight);
}

// Replicate border type implementation
// Replicate: clamp to edge. Equivalent to per-axis OOB snap; min/max maps cleanly to GPU integer ops.
if constexpr (BorderType == eBorderType::BORDER_TYPE_REPLICATE) {
x = std::clamp<int64_t>(w, 0, imgWidth - 1);
y = std::clamp<int64_t>(h, 0, imgHeight - 1);
x = detail::clamp_i64(w, 0, imgWidth - 1);
y = detail::clamp_i64(h, 0, imgHeight - 1);
}

// Wrap border type implementation
if constexpr (BorderType == eBorderType::BORDER_TYPE_WRAP) {
if (w < 0 || w >= imgWidth) {
x = (w % imgWidth + imgWidth) % imgWidth;
}

if (h < 0 || h >= imgHeight) {
y = (h % imgHeight + imgHeight) % imgHeight;
}
x = detail::euclid_mod_i64_fast(w, imgWidth);
y = detail::euclid_mod_i64_fast(h, imgHeight);
}
Comment thread
zacharyvincze marked this conversation as resolved.

return m_desc.at(n, y, x, c);
Expand Down Expand Up @@ -160,7 +206,7 @@ class BorderWrapper {
__device__ __host__ inline int64_t batches() const {
    // Delegates to the wrapped image descriptor.
    return m_desc.batches();
}

/**
* @brief Retries the number of channels in the image.
* @brief Retrieves the number of channels in the image.
*
* @return Image channels.
*/
Expand Down
Loading