-
Notifications
You must be signed in to change notification settings - Fork 7
[Optimization] Improve border/interpolation wrapper implementations #147
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
zacharyvincze
wants to merge
13
commits into
ROCm:develop
Choose a base branch
from
zacharyvincze:zv/optimization/interp-border-mode-patch
base: develop
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
13 commits
Select commit
Hold shift + click to select a range
718c1ad
Improve hot paths for BorderWrapper/InterpolationWrapper
zacharyvincze 00f6b89
Add dedicated warp_affine path
zacharyvincze 5af67d2
Remove nearest bounds check
zacharyvincze b781467
Per-axis bounds checking in border wrapper
zacharyvincze 56f0831
Unroll for loops in cubic interpolation implementation
zacharyvincze 852176a
General optimizations for border/interpolation wrapper implementations
zacharyvincze 2beed7a
Change warp perspective block size
zacharyvincze 79d84a1
Revert block size changes
zacharyvincze 99ac067
Organize helpers/improve docstrings
zacharyvincze 724ccc6
Merge branch 'develop' into zv/optimization/interp-border-mode-patch
zacharyvincze 0426add
Merge branch 'develop' into zv/optimization/interp-border-mode-patch
zacharyvincze 891f4fb
Merge branch 'develop' into zv/optimization/interp-border-mode-patch
zacharyvincze be15ee7
Merge branch 'develop' into zv/optimization/interp-border-mode-patch
zacharyvincze File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,164 @@ | ||
| /* | ||
| * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. | ||
| * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| * of this software and associated documentation files (the "Software"), to deal | ||
| * in the Software without restriction, including without limitation the rights | ||
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| * copies of the Software, and to permit persons to whom the Software is | ||
| * furnished to do so, subject to the following conditions: | ||
| * | ||
| * The above copyright notice and this permission notice shall be included in | ||
| * all copies or substantial portions of the Software. | ||
| * | ||
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
| * THE SOFTWARE. | ||
| */ | ||
|
|
||
| #pragma once | ||
|
|
||
| /** | ||
| * @file sampling_helpers.hpp | ||
| * @brief Small host/device helpers for border coordinate math and interpolation index conversion. | ||
| */ | ||
|
|
||
| #include <hip/hip_runtime.h> | ||
|
|
||
| #include <cmath> | ||
| #include <cstdint> | ||
|
|
||
| namespace roccv { | ||
| namespace detail { | ||
|
|
||
/**
 * @brief Branchless absolute value of a signed 64-bit integer.
 * @param v Input value.
 * @return The magnitude of @p v. The single exception is @c INT64_MIN, whose magnitude (2^63) does not fit
 *         in @c int64_t; for that input the result wraps and @c INT64_MIN is returned unchanged.
 * @note All arithmetic is done on the unsigned bit pattern, so the @c INT64_MIN case wraps instead of
 *       invoking signed-overflow undefined behavior, and no negative value is ever right-shifted.
 * @note Avoids @c std::abs in device code; useful on GPU hot paths.
 */
__device__ __host__ __forceinline__ int64_t abs_i64(int64_t v) {
    const uint64_t bits = static_cast<uint64_t>(v);
    // All ones when the sign bit is set, zero otherwise (0 - 1 wraps to UINT64_MAX).
    const uint64_t mask = uint64_t{0} - (bits >> 63);
    // With an all-ones mask this is "invert, then add one" (two's-complement negate); with a zero mask it is identity.
    return static_cast<int64_t>((bits ^ mask) - mask);
}
|
|
||
/**
 * @brief Branchless absolute value of a signed 32-bit integer.
 * @param v Input value.
 * @return The magnitude of @p v. The single exception is @c INT32_MIN, whose magnitude (2^31) does not fit
 *         in @c int32_t; for that input the result wraps and @c INT32_MIN is returned unchanged.
 * @note All arithmetic is done on the unsigned bit pattern, so the @c INT32_MIN case wraps instead of
 *       invoking signed-overflow undefined behavior, and no negative value is ever right-shifted.
 */
__device__ __host__ __forceinline__ int32_t abs_i32(int32_t v) {
    const uint32_t bits = static_cast<uint32_t>(v);
    // All ones when the sign bit is set, zero otherwise (0 - 1 wraps to UINT32_MAX).
    const uint32_t mask = uint32_t{0} - (bits >> 31);
    // With an all-ones mask this is "invert, then add one" (two's-complement negate); with a zero mask it is identity.
    return static_cast<int32_t>((bits ^ mask) - mask);
}
|
|
||
/**
 * @brief Select the smaller of two signed 32-bit integers.
 * @param a First candidate.
 * @param b Second candidate.
 * @return @p a when it is strictly less than @p b; otherwise @p b.
 */
__device__ __host__ __forceinline__ int32_t min_i32(int32_t a, int32_t b) {
    if (a < b) {
        return a;
    }
    return b;
}
|
|
||
/**
 * @brief Select the smaller of two signed 64-bit integers.
 * @param a First candidate.
 * @param b Second candidate.
 * @return @p a when it is strictly less than @p b; otherwise @p b.
 */
__device__ __host__ __forceinline__ int64_t min_i64(int64_t a, int64_t b) {
    if (a < b) {
        return a;
    }
    return b;
}
|
|
||
/**
 * @brief Select the larger of two signed 64-bit integers.
 * @param a First candidate.
 * @param b Second candidate.
 * @return @p a when it is strictly greater than @p b; otherwise @p b.
 */
__device__ __host__ __forceinline__ int64_t max_i64(int64_t a, int64_t b) {
    if (a > b) {
        return a;
    }
    return b;
}
|
|
||
| /** | ||
| * @brief Clamp a signed 64-bit integer to a closed interval. | ||
| * @param v Value to clamp. | ||
| * @param lo Lower bound (inclusive). | ||
| * @param hi Upper bound (inclusive); must satisfy @p lo <= @p hi. | ||
| * @return @p v restricted to the inclusive interval between @p lo and @p hi. | ||
| */ | ||
| __device__ __host__ __forceinline__ int64_t clamp_i64(int64_t v, int64_t lo, int64_t hi) { | ||
| return min_i64(max_i64(v, lo), hi); | ||
| } | ||
|
|
||
/**
 * @brief Non-negative (Euclidean) remainder for 32-bit operands.
 * @param a Dividend; may be negative.
 * @param modulus Divisor; must be strictly positive.
 * @return The value in <tt>[0, modulus)</tt> that is congruent to @p a modulo @p modulus.
 */
__device__ __host__ inline int32_t euclid_mod_i32(int32_t a, int32_t modulus) {
    const int32_t rem = a % modulus;
    // The built-in remainder carries the sign of the dividend; shift negative results up into [0, modulus).
    return rem < 0 ? rem + modulus : rem;
}
|
|
||
/**
 * @brief Non-negative (Euclidean) remainder for 64-bit operands.
 * @param a Dividend; may be negative.
 * @param modulus Divisor; must be strictly positive.
 * @return The value in <tt>[0, modulus)</tt> that is congruent to @p a modulo @p modulus.
 * @note Performs one remainder operation followed by a sign correction, rather than a second modulo.
 */
__device__ __host__ inline int64_t euclid_mod_i64(int64_t a, int64_t modulus) {
    const int64_t rem = a % modulus;
    // The built-in remainder carries the sign of the dividend; shift negative results up into [0, modulus).
    return rem < 0 ? rem + modulus : rem;
}
|
|
||
| /** | ||
| * @brief Euclidean modulo with a 32-bit fast path on the GPU when operands are in a safe range. | ||
| * @param a Dividend. | ||
| * @param modulus Strictly positive modulus. | ||
| * @return Same as euclid_mod_i64() when operands fit the device fast path; otherwise defers to euclid_mod_i64(). | ||
| * @note On device, uses 32-bit remainder when @p modulus and @p a are sufficiently small to avoid 64-bit division. | ||
| * On host, always uses euclid_mod_i64(). Caller must keep values in range when relying on the fast path. | ||
| */ | ||
| __device__ __host__ inline int64_t euclid_mod_i64_fast(int64_t a, int64_t modulus) { | ||
| #if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__) | ||
| constexpr int64_t kLim = int64_t{1} << 30; | ||
| if (modulus > 0 && modulus < kLim && a > -kLim && a < kLim) { | ||
| int32_t ai = static_cast<int32_t>(a); | ||
| int32_t m = static_cast<int32_t>(modulus); | ||
| int32_t r = ai % m; | ||
| if (r < 0) r += m; | ||
| return static_cast<int64_t>(r); | ||
| } | ||
| #endif | ||
| return euclid_mod_i64(a, modulus); | ||
| } | ||
|
|
||
/**
 * @brief Round a subpixel coordinate down to the integer grid index at or below it.
 * @param x Source coordinate in pixels.
 * @return floor(@p x) as an int64, i.e. the left/top neighbor index used by bilinear/cubic sampling.
 * @note Device builds compute the floor with the compiler builtin @c __builtin_elementwise_floor;
 *       host builds call @c floorf(). Both then convert the floored float to an integer.
 */
__device__ __host__ __forceinline__ int64_t interp_floor_i64(float x) {
#if defined(__HIP_DEVICE_COMPILE__) || defined(__CUDA_ARCH__)
    const float floored = __builtin_elementwise_floor(x);
    return static_cast<int64_t>(static_cast<long long>(floored));
#else
    const float floored = floorf(x);
    return static_cast<int64_t>(floored);
#endif
}
|
|
||
/**
 * @brief Round a subpixel coordinate to the nearest integer index.
 * @param x Source coordinate in pixels.
 * @return The integer closest to @p x; exact halves round away from zero, following @c std::llroundf().
 */
__device__ __host__ __forceinline__ int64_t interp_nearest_i64(float x) {
    const long long rounded = std::llroundf(x);
    return static_cast<int64_t>(rounded);
}
|
|
||
| } // namespace detail | ||
| } // namespace roccv | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.