Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
77b2090
Create helper function for determining maximum block size
zacharyvincze Feb 9, 2026
6d616af
Define StreamCallback as static inline
zacharyvincze Feb 9, 2026
348816d
Use dynamic block size helper for Composite operator
zacharyvincze Feb 9, 2026
3b0488e
Add helper for grid size calculations
zacharyvincze Feb 9, 2026
2ceddce
Add dynamic block size calculations to operators
zacharyvincze Feb 9, 2026
8c84900
Add dynamic block sizes to rest of operators
zacharyvincze Feb 9, 2026
d6c3418
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Feb 11, 2026
5bbacd9
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Feb 12, 2026
284eca8
Address PR comments
zacharyvincze Feb 17, 2026
77d79e4
Update block calculations to floor result
zacharyvincze Feb 19, 2026
4417820
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Feb 19, 2026
1c37b91
Hardcode target block size instead of querying runtime
zacharyvincze Feb 20, 2026
4a02208
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Feb 23, 2026
9627e21
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Feb 25, 2026
88a3eb2
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Mar 2, 2026
222d50c
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Mar 6, 2026
9666d83
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Mar 6, 2026
a305bf6
Merge branch 'zv/optimization/dynamic-block-sizes' of github.com:zach…
zacharyvincze Mar 6, 2026
25f4483
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Mar 16, 2026
0188cf2
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Apr 23, 2026
2c2fb41
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Apr 23, 2026
1efde9d
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze Apr 24, 2026
a899bb0
Use 1D blocks for pointwise kernels
zacharyvincze Apr 27, 2026
7ce6521
Merge branch 'zv/optimization/dynamic-block-sizes' of github.com:zach…
zacharyvincze Apr 27, 2026
063f97f
Change default 2D block size to 256
zacharyvincze Apr 27, 2026
9ba59d4
Switch default target block size to 128
zacharyvincze Apr 27, 2026
981346a
Use runtime API to determine maximum occupancy
zacharyvincze Apr 27, 2026
f5baca1
Increase default caps
zacharyvincze Apr 27, 2026
00a543d
Decrease 1D block size cap to 256
zacharyvincze Apr 27, 2026
e6f5588
Increase block size cap to 1024 for 1D blocks
zacharyvincze Apr 27, 2026
fc8c008
Return cap sizes back to normal defaults
zacharyvincze Apr 27, 2026
54849f9
Merge branch 'develop' into zv/optimization/dynamic-block-sizes
zacharyvincze May 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 130 additions & 2 deletions include/core/detail/hip_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@
#include <functional>
#include <memory>

#include "core/hip_assert.h"

namespace roccv::detail {
static void StreamCallback(void* userData) {
static inline void StreamCallback(void* userData) {
std::function<void()>* func = static_cast<std::function<void()>*>(userData);
(*func)();
delete func;
Expand All @@ -46,4 +48,130 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) {
HIP_VALIDATE_NO_ERRORS(hipLaunchHostFunc(stream, StreamCallback, data.get()));
data.release(); // Release ownership, StreamCallback is responsible for it now
}
} // namespace roccv::detail

/**
 * @brief Return the wavefront (warp) size of the currently active HIP device.
 *
 * The result is cached per thread and refreshed whenever the active device
 * changes, so multi-device callers always observe the correct width while
 * the common single-device path pays only for a hipGetDevice call.
 */
inline int CachedWarpSize() {
    static thread_local int lastDeviceId = -1;
    static thread_local int lastWarpSize = 0;

    int activeDevice = -1;
    HIP_VALIDATE_NO_ERRORS(hipGetDevice(&activeDevice));
    if (activeDevice == lastDeviceId) {
        return lastWarpSize;
    }

    HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&lastWarpSize, hipDeviceAttributeWarpSize, activeDevice));
    lastDeviceId = activeDevice;
    return lastWarpSize;
}

/**
 * @brief Query and cache hipOccupancyMaxPotentialBlockSize per (kernel, device).
 *
 * The driver selects a thread count that maximizes resident wavefronts per CU
 * for the given kernel on the current device, taking its register and
 * static-shared-memory usage into account. The answer is clamped to Cap so
 * the API's preference for maximum occupancy cannot override workload-class
 * judgment (memory-bound ops gain nothing past ~50% occupancy and can lose
 * throughput to cache pressure with overly large blocks).
 *
 * Each (Kernel, Cap) instantiation owns its own thread-local cache slot, so
 * the runtime query executes once per (kernel, device, cap) per thread.
 *
 * @tparam Kernel The __global__ function pointer (auto NTTP — each unique
 *                kernel address gets its own cached result).
 * @tparam Cap Upper bound on the returned block size.
 */
template <auto Kernel, int Cap>
inline int CachedOccupancyBlockSize() {
    static thread_local int lastDeviceId = -1;
    static thread_local int lastBlockSize = 0;

    int activeDevice = -1;
    HIP_VALIDATE_NO_ERRORS(hipGetDevice(&activeDevice));
    if (activeDevice == lastDeviceId) {
        return lastBlockSize;
    }

    int unusedMinGridSize = 0;
    HIP_VALIDATE_NO_ERRORS(hipOccupancyMaxPotentialBlockSize(&unusedMinGridSize, &lastBlockSize, Kernel, 0, Cap));
    lastDeviceId = activeDevice;
    return lastBlockSize;
}

/**
 * @brief Choose a 1D block size for a pointwise kernel via a runtime
 *        occupancy query, bounded by Cap and cached per (kernel, device).
 *
 * Intended for pointwise kernels (no neighborhood reads). The driver tunes
 * the thread count to this specific kernel's register pressure on the
 * current device — important on architectures with very different
 * SIMD-per-CU counts and register-file sizes (e.g. CDNA wants more
 * wavefronts in flight per CU than RDNA to hide HBM latency).
 *
 * Pair with GetGridSize1D — see its docs for the row-major launch shape.
 *
 * @tparam Kernel The __global__ function pointer.
 * @tparam Cap Upper bound on threads per block.
 * @return dim3(blockSize, 1, 1).
 */
template <auto Kernel, int Cap = 256>
inline dim3 GetBlockSize1D() {
    const int threads = CachedOccupancyBlockSize<Kernel, Cap>();
    return dim3(threads, 1, 1);
}

/**
 * @brief Pick a 2D block size for a stencil/transform kernel via runtime
 * occupancy query, capped at Cap, and cached per (kernel, device).
 *
 * Use for kernels with 2D locality (stencils, interpolation neighborhoods,
 * affine warps). Reshapes the queried thread count as
 * (warpSize, blockSize / warpSize, 1) so threadIdx.x is wavefront-aligned
 * for coalescing while threadIdx.y stacks rows for tile-style cache reuse.
 *
 * @tparam Kernel The __global__ function pointer.
 * @tparam Cap Upper bound on threads per block.
 * @return dim3(warpSize, max(blockSize / warpSize, 1), 1).
 */
template <auto Kernel, int Cap = 512>
inline dim3 GetBlockSize2D() {
    int blockSize = CachedOccupancyBlockSize<Kernel, Cap>();
    int warpSize = CachedWarpSize();
    // Guard: if Cap (or the occupancy query) yields fewer threads than one
    // wavefront, blockSize / warpSize floors to 0 and would produce an
    // invalid launch configuration (blockDim.y == 0). Clamp to one row.
    int rows = blockSize / warpSize;
    if (rows < 1) rows = 1;
    return dim3(warpSize, rows, 1);
}

/**
 * @brief Compute the launch grid for a 2D kernel.
 *
 * @param[in] width The width of the image.
 * @param[in] height The height of the image.
 * @param[in] batchSize The batch size of the image.
 * @param[in] blockSize The block size of the kernel.
 * @return The grid size (ceil-divided in x and y, batch along z).
 */
static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize, dim3 blockSize) {
    const size_t gridX = (width + blockSize.x - 1) / blockSize.x;
    const size_t gridY = (height + blockSize.y - 1) / blockSize.y;
    return dim3(gridX, gridY, batchSize);
}

/**
 * @brief Compute the launch grid for a 1D-row-major launch.
 *
 * Places one image row along gridDim.y and the batch along gridDim.z. The
 * block from GetBlockSize1D has blockDim.y == 1, so the kernel's standard
 * `y = blockDim.y * blockIdx.y + threadIdx.y` collapses to `y = blockIdx.y`
 * with no kernel changes required.
 *
 * Caller is responsible for ensuring height does not exceed the device's
 * gridDim.y limit (typically 65535) and batchSize does not exceed gridDim.z.
 *
 * @param[in] width The width of the image.
 * @param[in] height The height of the image (becomes gridDim.y).
 * @param[in] batchSize The number of images (becomes gridDim.z).
 * @param[in] blockSize Block size from GetBlockSize1D.
 * @return The grid size: dim3(ceil(width / blockSize.x), height, batchSize).
 */
static inline dim3 GetGridSize1D(size_t width, size_t height, size_t batchSize, dim3 blockSize) {
    const size_t blocksPerRow = (width + blockSize.x - 1) / blockSize.x;
    return dim3(blocksPerRow, height, batchSize);
}

} // namespace roccv::detail
1 change: 1 addition & 0 deletions include/kernels/device/bnd_box_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ THE SOFTWARE.

#include <core/detail/type_traits.hpp>

#include "common/math_vector.hpp"
#include "kernels/kernel_helpers.hpp"
#include "operator_types.h"

Expand Down
1 change: 1 addition & 0 deletions include/kernels/host/bnd_box_host.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ THE SOFTWARE.

#include <core/detail/type_traits.hpp>

#include "common/math_vector.hpp"
#include "kernels/kernel_helpers.hpp"
#include "operator_types.h"

Expand Down
10 changes: 5 additions & 5 deletions src/op_bilateral_filter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ THE SOFTWARE.

#include "common/validation_helpers.hpp"
#include "core/detail/casting.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/wrappers/border_wrapper.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "kernels/device/bilateral_filter_device.hpp"
Expand Down Expand Up @@ -61,21 +62,20 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp
sigmaSpace = 1.0f;
}

const int radius =
(diameter <= 0) ? static_cast<int>(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1);
const int radius = (diameter <= 0) ? static_cast<int>(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1);

float spaceCoeff = -1 / (2 * sigmaSpace * sigmaSpace);
float colorCoeff = -1 / (2 * sigmaColor * sigmaColor);

if (device == eDeviceType::GPU) {
dim3 block(8, 8);
constexpr auto kernel = Kernels::Device::bilateral_filter<T, BorderWrapper<T, B>, ImageWrapper<T>>;
dim3 block = detail::GetBlockSize2D<kernel>();
uint32_t xGridSize = (outputWrapper.width() + (block.x * 2) - 1) / (block.x * 2);
uint32_t yGridSize = (outputWrapper.height() + (block.y * 2) - 1) / (block.y * 2);
uint32_t zGridSize = outputWrapper.batches();
dim3 grid(xGridSize, yGridSize, zGridSize);

Kernels::Device::bilateral_filter<T>
<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff);
kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff);
} else if (device == eDeviceType::CPU) {
int divisor = std::gcd(4, outputWrapper.height()); // greatest common divisor
int dividend = std::gcd((numThreads / divisor), outputWrapper.width());
Expand Down
11 changes: 5 additions & 6 deletions src/op_bnd_box.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,8 @@ THE SOFTWARE.

#include <algorithm>
#include <cstring>
#include <iostream>
#include <vector>

#include "common/math_vector.hpp"
#include "common/validation_helpers.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/tensor.hpp"
Expand All @@ -52,8 +50,9 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
auto batchSize = inputWrapper.batches();
switch (device) {
case eDeviceType::GPU: {
const dim3 block(32, 32);
const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y, batchSize);
constexpr auto kernel = Kernels::Device::bndbox_kernel<has_alpha, T, ImageWrapper<T>, ImageWrapper<T>>;
const dim3 block = detail::GetBlockSize1D<kernel>();
const dim3 grid = detail::GetGridSize1D(width, height, batchSize, block);

Rect_t *rects_ptr = nullptr;
const auto n_rects = rects->size();
Expand All @@ -63,8 +62,8 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso
HIP_VALIDATE_NO_ERRORS(
hipMemcpyAsync(rects_ptr, rects->data(), sizeof(Rect_t) * n_rects, hipMemcpyHostToDevice, stream));
}
Kernels::Device::bndbox_kernel<has_alpha, T>
<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height, width);
kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height,
width);
if (n_rects > 0) {
HIP_VALIDATE_NO_ERRORS(hipFreeAsync(rects_ptr, stream));
}
Expand Down
11 changes: 7 additions & 4 deletions src/op_composite.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <functional>

#include "common/validation_helpers.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "kernels/device/composite_device.hpp"
#include "kernels/host/composite_host.hpp"
Expand All @@ -40,10 +41,12 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c

switch (device) {
case eDeviceType::GPU: {
dim3 block(64, 16);
dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
outputWrapper.batches());
Kernels::Device::composite<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
constexpr auto kernel = Kernels::Device::composite<ImageWrapper<SrcType>, ImageWrapper<MaskType>,
ImageWrapper<DstType>>;
dim3 block = detail::GetBlockSize1D<kernel>();
dim3 grid =
detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
kernel<<<grid, block, 0, stream>>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper);
break;
}

Expand Down
38 changes: 19 additions & 19 deletions src/op_convert_to.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,23 @@ THE SOFTWARE.
*/
#include "op_convert_to.hpp"

#include <hip/hip_runtime.h>

#include <functional>

#include <hip/hip_runtime.h>
#include "core/wrappers/image_wrapper.hpp"
#include "common/validation_helpers.hpp"
#include "core/detail/casting.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/detail/type_traits.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "kernels/device/convert_to_device.hpp"
#include "kernels/host/convert_to_host.hpp"

namespace roccv {

template <typename SRC_DT, typename DST_DT, int NC>
void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output,
double alpha, double beta, eDeviceType device) {

void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha,
double beta, eDeviceType device) {
using SRC_DT_NC = detail::MakeType<SRC_DT, NC>;
using DST_DT_NC = detail::MakeType<DST_DT, NC>;

Expand All @@ -47,17 +48,19 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
using DST_BT = detail::BaseType<DST_DT>;

using DT_AB = decltype(float() * SRC_BT() * DST_BT());

DT_AB alpha_ab = detail::SaturateCast<DT_AB>(alpha);
DT_AB beta_ab = detail::SaturateCast<DT_AB>(beta);

// Launch CPU/GPU kernel depending on requested device type.
switch (device) {
case eDeviceType::GPU: {
dim3 block(64, 16);
dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
outputWrapper.batches());
Kernels::Device::convert_to<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
constexpr auto kernel =
Kernels::Device::convert_to<ImageWrapper<SRC_DT_NC>, ImageWrapper<DST_DT_NC>, DT_AB>;
dim3 block = detail::GetBlockSize1D<kernel>();
dim3 grid =
detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, alpha_ab, beta_ab);
break;
}
case eDeviceType::CPU: {
Expand All @@ -68,16 +71,14 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const
}

template <typename SRC_DT, typename DST_DT>
void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
double alpha, double beta, eDeviceType device) {

void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha,
double beta, eDeviceType device) {
int64_t channels = output.shape(output.layout().channels_index());
// Select kernel dispatcher based on number of channels.
// clang-format off
static const std::array<std::function<void(hipStream_t, const Tensor &, const Tensor &, double, double, eDeviceType)>, 4>
funcs = {dispatch_convert_to_channels<SRC_DT, DST_DT, 1>, dispatch_convert_to_channels<SRC_DT, DST_DT, 2>, dispatch_convert_to_channels<SRC_DT, DST_DT, 3>, dispatch_convert_to_channels<SRC_DT, DST_DT, 4>};



// clang-format on

auto func = funcs.at(channels - 1);
Expand All @@ -86,11 +87,10 @@ void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, c
}

template <typename SRC_DT>
void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
double alpha, double beta, eDeviceType device) {

void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha,
double beta, eDeviceType device) {
eDataType output_dtype = output.dtype().etype();

// Select kernel dispatcher based on a base input datatype.
// clang-format off
static const std::unordered_map<eDataType, std::function<void(hipStream_t, const Tensor &, const Tensor &, double, double, eDeviceType)>>
Expand Down
12 changes: 7 additions & 5 deletions src/op_copy_make_border.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@
#include <functional>

#include "common/validation_helpers.hpp"
#include "core/detail/casting.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/wrappers/border_wrapper.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "core/wrappers/interpolation_wrapper.hpp"
#include "kernels/device/copy_make_border_device.hpp"
#include "kernels/host/copy_make_border_host.hpp"

Expand All @@ -43,10 +44,11 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp

switch (device) {
case eDeviceType::GPU: {
dim3 block_dim(64, 16);
dim3 grid_dim((out_desc.width() + block_dim.x - 1) / block_dim.x,
(out_desc.height() + block_dim.y - 1) / block_dim.y, out_desc.batches());
Kernels::Device::copy_make_border<<<grid_dim, block_dim, 0, stream>>>(in_desc, out_desc, top, left);
constexpr auto kernel =
Kernels::Device::copy_make_border<BorderWrapper<T, BorderMode>, ImageWrapper<T>>;
dim3 block = detail::GetBlockSize1D<kernel>();
dim3 grid = detail::GetGridSize1D(out_desc.width(), out_desc.height(), out_desc.batches(), block);
kernel<<<grid, block, 0, stream>>>(in_desc, out_desc, top, left);
break;
}
case eDeviceType::CPU: {
Expand Down
10 changes: 6 additions & 4 deletions src/op_custom_crop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ THE SOFTWARE.
#include <functional>

#include "common/validation_helpers.hpp"
#include "core/detail/hip_utils.hpp"
#include "core/wrappers/image_wrapper.hpp"
#include "kernels/device/custom_crop_device.hpp"
#include "kernels/host/custom_crop_host.hpp"
Expand All @@ -40,10 +41,11 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T

switch (device) {
case eDeviceType::GPU: {
dim3 block(64, 16);
dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y,
outputWrapper.batches());
Kernels::Device::custom_crop<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
constexpr auto kernel = Kernels::Device::custom_crop<ImageWrapper<T>, ImageWrapper<T>>;
dim3 block = detail::GetBlockSize1D<kernel>();
dim3 grid =
detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block);
kernel<<<grid, block, 0, stream>>>(inputWrapper, outputWrapper, cropRect);
break;
}

Expand Down
Loading