diff --git a/include/core/detail/hip_utils.hpp b/include/core/detail/hip_utils.hpp index d8a918f1..74a17108 100644 --- a/include/core/detail/hip_utils.hpp +++ b/include/core/detail/hip_utils.hpp @@ -25,8 +25,10 @@ #include #include +#include "core/hip_assert.h" + namespace roccv::detail { -static void StreamCallback(void* userData) { +static inline void StreamCallback(void* userData) { std::function* func = static_cast*>(userData); (*func)(); delete func; @@ -46,4 +48,130 @@ void LaunchHostFuncAsync(hipStream_t stream, Callable&& cb) { HIP_VALIDATE_NO_ERRORS(hipLaunchHostFunc(stream, StreamCallback, data.get())); data.release(); // Release ownership, StreamCallback is responsible for it now } -} // namespace roccv::detail \ No newline at end of file + +/** + * @brief Get the device's wavefront/warp size, cached per-thread. + * + * Re-queries the runtime if the active HIP device has changed since the + * last call from the calling thread (so multi-device callers stay correct + * without paying for a query on every launch). + */ +inline int CachedWarpSize() { + static thread_local int cachedDeviceId = -1; + static thread_local int cachedWarpSize = 0; + int deviceId; + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId)); + if (deviceId != cachedDeviceId) { + HIP_VALIDATE_NO_ERRORS(hipDeviceGetAttribute(&cachedWarpSize, hipDeviceAttributeWarpSize, deviceId)); + cachedDeviceId = deviceId; + } + return cachedWarpSize; +} + +/** + * @brief Cache hipOccupancyMaxPotentialBlockSize per (kernel, device). + * + * The driver picks a thread count that maximizes resident wavefronts per CU + * for the given kernel on the current device, accounting for the kernel's + * register and static-shared-memory usage. The result is bounded above by + * Cap so the API's drive toward maximum occupancy can't override workload- + * class judgment (memory-bound ops gain nothing past ~50% occupancy and + * can lose throughput to cache pressure with overly large blocks). + * + * Each (Kernel, Cap) instantiation gets its own thread-local cache slot, + * so the runtime query runs once per (kernel, device, cap) per thread. + * + * @tparam Kernel The __global__ function pointer (auto NTTP — each unique + * kernel address gets its own cached result). + * @tparam Cap Upper bound on the returned block size. + */ +template +inline int CachedOccupancyBlockSize() { + static thread_local int cachedDeviceId = -1; + static thread_local int cachedBlockSize = 0; + int deviceId; + HIP_VALIDATE_NO_ERRORS(hipGetDevice(&deviceId)); + if (deviceId != cachedDeviceId) { + int minGridSize; + HIP_VALIDATE_NO_ERRORS(hipOccupancyMaxPotentialBlockSize(&minGridSize, &cachedBlockSize, Kernel, 0, Cap)); + cachedDeviceId = deviceId; + } + return cachedBlockSize; +} + +/** + * @brief Pick a 1D block size for a pointwise kernel via runtime occupancy + * query, capped at Cap, and cached per (kernel, device). + * + * Use for pointwise kernels (no neighborhood reads). The driver returns a + * thread count tuned to this specific kernel's register pressure on the + * current device — important on architectures with very different SIMD-per-CU + * counts and register-file sizes (e.g. CDNA wants more wavefronts in flight + * per CU than RDNA to hide HBM latency). + * + * Pair with GetGridSize1D — see its docs for the row-major launch shape. + * + * @tparam Kernel The __global__ function pointer. + * @tparam Cap Upper bound on threads per block. + * @return dim3(blockSize, 1, 1). + */ +template +inline dim3 GetBlockSize1D() { + return dim3(CachedOccupancyBlockSize(), 1, 1); +} + +/** + * @brief Pick a 2D block size for a stencil/transform kernel via runtime + * occupancy query, capped at Cap, and cached per (kernel, device). + * + * Use for kernels with 2D locality (stencils, interpolation neighborhoods, + * affine warps). Reshapes the queried thread count as + * (warpSize, blockSize / warpSize, 1) so threadIdx.x is wavefront-aligned + * for coalescing while threadIdx.y stacks rows for tile-style cache reuse. + * + * @tparam Kernel The __global__ function pointer. + * @tparam Cap Upper bound on threads per block. + * @return dim3(warpSize, blockSize / warpSize, 1). + */ +template +inline dim3 GetBlockSize2D() { + int blockSize = CachedOccupancyBlockSize(); + int warpSize = CachedWarpSize(); + return dim3(warpSize, blockSize / warpSize, 1); +} + +/** + * @brief Get the grid size for a 2D kernel. + * + * @param[in] width The width of the image. + * @param[in] height The height of the image. + * @param[in] batchSize The batch size of the image. + * @param[in] blockSize The block size of the kernel. + * @return The grid size. + */ +static inline dim3 GetGridSize2D(size_t width, size_t height, size_t batchSize, dim3 blockSize) { + return dim3((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, batchSize); +} + +/** + * @brief Get the grid size for a 1D-row-major launch. + * + * Lays one image row along gridDim.y and the batch along gridDim.z. The + * block from GetBlockSize1D has blockDim.y == 1, so the kernel's standard + * `y = blockDim.y * blockIdx.y + threadIdx.y` collapses to `y = blockIdx.y` + * with no kernel changes required. + * + * Caller is responsible for ensuring height does not exceed the device's + * gridDim.y limit (typically 65535) and batchSize does not exceed gridDim.z. + * + * @param[in] width The width of the image. + * @param[in] height The height of the image (becomes gridDim.y). + * @param[in] batchSize The number of images (becomes gridDim.z). + * @param[in] blockSize Block size from GetBlockSize1D. + * @return The grid size: dim3(ceil(width / blockSize.x), height, batchSize). + */ +static inline dim3 GetGridSize1D(size_t width, size_t height, size_t batchSize, dim3 blockSize) { + return dim3((width + blockSize.x - 1) / blockSize.x, height, batchSize); +} + +} // namespace roccv::detail diff --git a/include/kernels/device/bnd_box_device.hpp b/include/kernels/device/bnd_box_device.hpp index 9076e634..14900490 100644 --- a/include/kernels/device/bnd_box_device.hpp +++ b/include/kernels/device/bnd_box_device.hpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include +#include "common/math_vector.hpp" #include "kernels/kernel_helpers.hpp" #include "operator_types.h" diff --git a/include/kernels/host/bnd_box_host.hpp b/include/kernels/host/bnd_box_host.hpp index db1af9f7..7b3826c1 100644 --- a/include/kernels/host/bnd_box_host.hpp +++ b/include/kernels/host/bnd_box_host.hpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include +#include "common/math_vector.hpp" #include "kernels/kernel_helpers.hpp" #include "operator_types.h" diff --git a/src/op_bilateral_filter.cpp b/src/op_bilateral_filter.cpp index dffba8ae..b6eb6511 100644 --- a/src/op_bilateral_filter.cpp +++ b/src/op_bilateral_filter.cpp @@ -30,6 +30,7 @@ THE SOFTWARE. #include "common/validation_helpers.hpp" #include "core/detail/casting.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/border_wrapper.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/bilateral_filter_device.hpp" @@ -61,21 +62,20 @@ void dispatch_bilateral_filter_border_mode(hipStream_t stream, const Tensor &inp sigmaSpace = 1.0f; } - const int radius = - (diameter <= 0) ? static_cast(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1); + const int radius = (diameter <= 0) ? static_cast(std::roundf(sigmaSpace * 1.5f)) : (diameter >> 1); float spaceCoeff = -1 / (2 * sigmaSpace * sigmaSpace); float colorCoeff = -1 / (2 * sigmaColor * sigmaColor); if (device == eDeviceType::GPU) { - dim3 block(8, 8); + constexpr auto kernel = Kernels::Device::bilateral_filter, ImageWrapper>; + dim3 block = detail::GetBlockSize2D(); uint32_t xGridSize = (outputWrapper.width() + (block.x * 2) - 1) / (block.x * 2); uint32_t yGridSize = (outputWrapper.height() + (block.y * 2) - 1) / (block.y * 2); uint32_t zGridSize = outputWrapper.batches(); dim3 grid(xGridSize, yGridSize, zGridSize); - Kernels::Device::bilateral_filter - <<>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff); + kernel<<>>(inputWrapper, outputWrapper, radius, spaceCoeff, colorCoeff); } else if (device == eDeviceType::CPU) { int divisor = std::gcd(4, outputWrapper.height()); // greatest common divisor int dividend = std::gcd((numThreads / divisor), outputWrapper.width()); diff --git a/src/op_bnd_box.cpp b/src/op_bnd_box.cpp index 3b2443e0..3c0d5430 100644 --- a/src/op_bnd_box.cpp +++ b/src/op_bnd_box.cpp @@ -25,10 +25,8 @@ THE SOFTWARE. #include #include -#include #include -#include "common/math_vector.hpp" #include "common/validation_helpers.hpp" #include "core/detail/hip_utils.hpp" #include "core/tensor.hpp" @@ -52,8 +50,9 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso auto batchSize = inputWrapper.batches(); switch (device) { case eDeviceType::GPU: { - const dim3 block(32, 32); - const dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y, batchSize); + constexpr auto kernel = Kernels::Device::bndbox_kernel, ImageWrapper>; + const dim3 block = detail::GetBlockSize1D(); + const dim3 grid = detail::GetGridSize1D(width, height, batchSize, block); Rect_t *rects_ptr = nullptr; const auto n_rects = rects->size(); @@ -63,8 +62,8 @@ void dispatch_bnd_box_dtype(hipStream_t stream, const Tensor &input, const Tenso HIP_VALIDATE_NO_ERRORS( hipMemcpyAsync(rects_ptr, rects->data(), sizeof(Rect_t) * n_rects, hipMemcpyHostToDevice, stream)); } - Kernels::Device::bndbox_kernel - <<>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height, width); + kernel<<>>(inputWrapper, outputWrapper, rects_ptr, n_rects, batchSize, height, + width); if (n_rects > 0) { HIP_VALIDATE_NO_ERRORS(hipFreeAsync(rects_ptr, stream)); } diff --git a/src/op_composite.cpp b/src/op_composite.cpp index 650a7bf1..3de572cf 100644 --- a/src/op_composite.cpp +++ b/src/op_composite.cpp @@ -24,6 +24,7 @@ #include #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/composite_device.hpp" #include "kernels/host/composite_host.hpp" @@ -40,10 +41,12 @@ void dispatch_composite_masktype(hipStream_t stream, const Tensor& foreground, c switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); - Kernels::Device::composite<<>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper); + constexpr auto kernel = Kernels::Device::composite, ImageWrapper, + ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = + detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + kernel<<>>(fgWrapper, bgWrapper, maskWrapper, outputWrapper); break; } diff --git a/src/op_convert_to.cpp b/src/op_convert_to.cpp index 3baa4577..936f315d 100644 --- a/src/op_convert_to.cpp +++ b/src/op_convert_to.cpp @@ -21,22 +21,23 @@ THE SOFTWARE. */ #include "op_convert_to.hpp" +#include + #include -#include -#include "core/wrappers/image_wrapper.hpp" #include "common/validation_helpers.hpp" #include "core/detail/casting.hpp" +#include "core/detail/hip_utils.hpp" #include "core/detail/type_traits.hpp" +#include "core/wrappers/image_wrapper.hpp" #include "kernels/device/convert_to_device.hpp" #include "kernels/host/convert_to_host.hpp" namespace roccv { template -void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output, - double alpha, double beta, eDeviceType device) { - +void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha, + double beta, eDeviceType device) { using SRC_DT_NC = detail::MakeType; using DST_DT_NC = detail::MakeType; @@ -47,17 +48,19 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const using DST_BT = detail::BaseType; using DT_AB = decltype(float() * SRC_BT() * DST_BT()); - + DT_AB alpha_ab = detail::SaturateCast(alpha); DT_AB beta_ab = detail::SaturateCast(beta); // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); - Kernels::Device::convert_to<<>>(inputWrapper, outputWrapper, alpha_ab, beta_ab); + constexpr auto kernel = + Kernels::Device::convert_to, ImageWrapper, DT_AB>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = + detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, alpha_ab, beta_ab); break; } case eDeviceType::CPU: { @@ -68,16 +71,14 @@ void dispatch_convert_to_channels(hipStream_t stream, const Tensor &input, const } template -void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, - double alpha, double beta, eDeviceType device) { - +void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha, + double beta, eDeviceType device) { int64_t channels = output.shape(output.layout().channels_index()); // Select kernel dispatcher based on number of channels. // clang-format off static const std::array, 4> funcs = {dispatch_convert_to_channels, dispatch_convert_to_channels, dispatch_convert_to_channels, dispatch_convert_to_channels}; - - + // clang-format on auto func = funcs.at(channels - 1); @@ -86,11 +87,10 @@ void dispatch_convert_to_output_dtype(hipStream_t stream, const Tensor &input, c } template -void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, - double alpha, double beta, eDeviceType device) { - +void dispatch_convert_to_input_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, double alpha, + double beta, eDeviceType device) { eDataType output_dtype = output.dtype().etype(); - + // Select kernel dispatcher based on a base input datatype. // clang-format off static const std::unordered_map> diff --git a/src/op_copy_make_border.cpp b/src/op_copy_make_border.cpp index feacfbd9..8a16f864 100644 --- a/src/op_copy_make_border.cpp +++ b/src/op_copy_make_border.cpp @@ -24,9 +24,10 @@ #include #include "common/validation_helpers.hpp" +#include "core/detail/casting.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/border_wrapper.hpp" #include "core/wrappers/image_wrapper.hpp" -#include "core/wrappers/interpolation_wrapper.hpp" #include "kernels/device/copy_make_border_device.hpp" #include "kernels/host/copy_make_border_host.hpp" @@ -43,10 +44,11 @@ void dispatch_copy_make_border_border_mode(hipStream_t stream, const Tensor& inp switch (device) { case eDeviceType::GPU: { - dim3 block_dim(64, 16); - dim3 grid_dim((out_desc.width() + block_dim.x - 1) / block_dim.x, - (out_desc.height() + block_dim.y - 1) / block_dim.y, out_desc.batches()); - Kernels::Device::copy_make_border<<>>(in_desc, out_desc, top, left); + constexpr auto kernel = + Kernels::Device::copy_make_border, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(out_desc.width(), out_desc.height(), out_desc.batches(), block); + kernel<<>>(in_desc, out_desc, top, left); break; } case eDeviceType::CPU: { diff --git a/src/op_custom_crop.cpp b/src/op_custom_crop.cpp index 3d1a7056..89916758 100644 --- a/src/op_custom_crop.cpp +++ b/src/op_custom_crop.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/custom_crop_device.hpp" #include "kernels/host/custom_crop_host.hpp" @@ -40,10 +41,11 @@ void dispatch_custom_crop_dtype(hipStream_t stream, const Tensor& input, const T switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); - Kernels::Device::custom_crop<<>>(inputWrapper, outputWrapper, cropRect); + constexpr auto kernel = Kernels::Device::custom_crop, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = + detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, cropRect); break; } diff --git a/src/op_cvt_color.cpp b/src/op_cvt_color.cpp index 31abd103..8f978999 100644 --- a/src/op_cvt_color.cpp +++ b/src/op_cvt_color.cpp @@ -23,9 +23,8 @@ THE SOFTWARE. #include -#include - #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/tensor.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/cvt_color_device.hpp" @@ -81,45 +80,70 @@ void CvtColor::operator()(hipStream_t stream, const Tensor &input, Tensor &outpu if (device == eDeviceType::GPU) { // Dispatch appropriate device kernel based on given conversion code - dim3 blockSize(32, 16); - dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, samples); - switch (conversionCode) { - case eColorConversionCode::COLOR_BGR2GRAY: - Kernels::Device::rgb_or_bgr_to_grayscale - <<>>(ImageWrapper(input), ImageWrapper(output)); + case eColorConversionCode::COLOR_BGR2GRAY: { + constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_grayscale< + uchar3, eSwizzle::ZYXW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); + kernel<<>>(ImageWrapper(input), ImageWrapper(output)); break; - - case eColorConversionCode::COLOR_RGB2GRAY: - Kernels::Device::rgb_or_bgr_to_grayscale - <<>>(ImageWrapper(input), ImageWrapper(output)); + } + + case eColorConversionCode::COLOR_RGB2GRAY: { + constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_grayscale< + uchar3, eSwizzle::XYZW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); + kernel<<>>(ImageWrapper(input), ImageWrapper(output)); break; + } case eColorConversionCode::COLOR_BGR2RGB: - case eColorConversionCode::COLOR_RGB2BGR: - Kernels::Device::reorder - <<>>(ImageWrapper(input), ImageWrapper(output)); + case eColorConversionCode::COLOR_RGB2BGR: { + constexpr auto kernel = Kernels::Device::reorder< + uchar3, eSwizzle::ZYXW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); + kernel<<>>(ImageWrapper(input), ImageWrapper(output)); break; - - case eColorConversionCode::COLOR_BGR2YUV: - Kernels::Device::rgb_or_bgr_to_yuv<<>>( - ImageWrapper(input), ImageWrapper(output), 128.0f); + } + + case eColorConversionCode::COLOR_BGR2YUV: { + constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_yuv< + uchar3, eSwizzle::ZYXW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); + kernel<<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; - - case eColorConversionCode::COLOR_RGB2YUV: - Kernels::Device::rgb_or_bgr_to_yuv<<>>( - ImageWrapper(input), ImageWrapper(output), 128.0f); + } + + case eColorConversionCode::COLOR_RGB2YUV: { + constexpr auto kernel = Kernels::Device::rgb_or_bgr_to_yuv< + uchar3, eSwizzle::XYZW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); + kernel<<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; - - case eColorConversionCode::COLOR_YUV2BGR: - Kernels::Device::yuv_to_rgb_or_bgr<<>>( - ImageWrapper(input), ImageWrapper(output), 128.0f); + } + + case eColorConversionCode::COLOR_YUV2BGR: { + constexpr auto kernel = Kernels::Device::yuv_to_rgb_or_bgr< + uchar3, eSwizzle::ZYXW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); + kernel<<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; - - case eColorConversionCode::COLOR_YUV2RGB: - Kernels::Device::yuv_to_rgb_or_bgr<<>>( - ImageWrapper(input), ImageWrapper(output), 128.0f); + } + + case eColorConversionCode::COLOR_YUV2RGB: { + constexpr auto kernel = Kernels::Device::yuv_to_rgb_or_bgr< + uchar3, eSwizzle::XYZW, ImageWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, samples, block); + kernel<<>>(ImageWrapper(input), ImageWrapper(output), 128.0f); break; + } default: throw Exception("Not implemented", eStatusType::NOT_IMPLEMENTED); diff --git a/src/op_flip.cpp b/src/op_flip.cpp index 5566a0b6..54886b6f 100644 --- a/src/op_flip.cpp +++ b/src/op_flip.cpp @@ -27,6 +27,7 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/exception.hpp" #include "core/status_type.h" #include "core/wrappers/image_wrapper.hpp" @@ -42,10 +43,11 @@ void dispatch_flip_axis(hipStream_t stream, const Tensor& input, const Tensor& o switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); - Kernels::Device::flip<<>>(inputWrapper, outputWrapper); + constexpr auto kernel = Kernels::Device::flip, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = + detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper); break; } diff --git a/src/op_gamma_contrast.cpp b/src/op_gamma_contrast.cpp index d6c78690..e764d27c 100644 --- a/src/op_gamma_contrast.cpp +++ b/src/op_gamma_contrast.cpp @@ -24,15 +24,11 @@ THE SOFTWARE. #include -#include #include #include -#include -#include -#include "common/array_wrapper.hpp" -#include "common/math_vector.hpp" #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/tensor.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/gamma_contrast_device.hpp" @@ -47,11 +43,12 @@ void dispatch_gamma_contrast_dtype(hipStream_t stream, const Tensor &input, cons ImageWrapper outputWrapper(output); if (device == eDeviceType::GPU) { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); + constexpr auto kernel = Kernels::Device::gamma_contrast, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = + detail::GetGridSize1D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); - Kernels::Device::gamma_contrast<<>>(inputWrapper, outputWrapper, gamma); + kernel<<>>(inputWrapper, outputWrapper, gamma); } else if (device == eDeviceType::CPU) { Kernels::Host::gamma_contrast(inputWrapper, outputWrapper, gamma); } diff --git a/src/op_histogram.cpp b/src/op_histogram.cpp index 24a2ef79..819c70fc 100644 --- a/src/op_histogram.cpp +++ b/src/op_histogram.cpp @@ -22,13 +22,11 @@ THE SOFTWARE. #include "op_histogram.hpp" #include -#include #include #include #include -#include "common/array_wrapper.hpp" #include "common/validation_helpers.hpp" #include "core/wrappers/generic_tensor_wrapper.hpp" #include "core/wrappers/image_wrapper.hpp" diff --git a/src/op_non_max_suppression.cpp b/src/op_non_max_suppression.cpp index 93c4f90c..d34b819e 100644 --- a/src/op_non_max_suppression.cpp +++ b/src/op_non_max_suppression.cpp @@ -24,7 +24,6 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" -#include "core/hip_assert.h" #include "kernels/device/non_max_suppression_device.hpp" #include "kernels/host/non_max_suppression_host.hpp" diff --git a/src/op_normalize.cpp b/src/op_normalize.cpp index 529638b8..666b317e 100644 --- a/src/op_normalize.cpp +++ b/src/op_normalize.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/detail/type_traits.hpp" #include "core/tensor.hpp" #include "core/wrappers/image_wrapper.hpp" @@ -49,11 +50,13 @@ void dispatch_normalize_stddev(hipStream_t stream, const Tensor& input, const Te switch (device) { case eDeviceType::GPU: { - dim3 block(32, 8); - dim3 grid((outputWrap.width() + block.x - 1) / block.x, (outputWrap.height() + block.y - 1) / block.y, - outputWrap.batches()); - Kernels::Device::normalize - <<>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, epsilon); + constexpr auto kernel = + Kernels::Device::normalize, ImageWrapper, ImageWrapper, + ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block); + kernel<<>>(inputWrap, baseWrap, scaleWrap, outputWrap, global_scale, shift, + epsilon); break; } case eDeviceType::CPU: { @@ -112,7 +115,8 @@ void Normalize::operator()(hipStream_t stream, const Tensor& input, const Tensor // TODO: Need to support scalar base/scale tensors at some point. Will require some extra handling on the kernel // level. Once in place, this check can be removed. CHECK_TENSOR_COMPARISON(base.shape(base.layout().channels_index()) == input.shape(input.layout().channels_index())); - CHECK_TENSOR_COMPARISON(scale.shape(scale.layout().channels_index()) == input.shape(input.layout().channels_index())); + CHECK_TENSOR_COMPARISON(scale.shape(scale.layout().channels_index()) == + input.shape(input.layout().channels_index())); // Create kernel dispatching table based on input/output datatype and number of channels. // clang-format off diff --git a/src/op_remap.cpp b/src/op_remap.cpp index 0992cf44..cdd4209d 100644 --- a/src/op_remap.cpp +++ b/src/op_remap.cpp @@ -23,12 +23,10 @@ THE SOFTWARE. #include -#include "common/array_wrapper.hpp" #include "common/validation_helpers.hpp" #include "core/detail/casting.hpp" +#include "core/detail/hip_utils.hpp" #include "core/detail/internal_structs.hpp" -#include "core/detail/math/math.hpp" -#include "core/detail/type_traits.hpp" #include "core/wrappers/image_wrapper.hpp" #include "core/wrappers/interpolation_wrapper.hpp" #include "kernels/device/remap_device.hpp" @@ -91,11 +89,12 @@ void dispatch_remap_mapInterp(hipStream_t stream, const Tensor &input, const Ten // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); - Kernels::Device::remap<<>>(inputWrapper, outputWrapper, wrappedMapTensor, - mapBatchSize, params); + constexpr auto kernel = Kernels::Device::remap< + InterpolationWrapper, ImageWrapper, InterpolationWrapper>; + dim3 block = detail::GetBlockSize2D(); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, wrappedMapTensor, mapBatchSize, params); break; } diff --git a/src/op_resize.cpp b/src/op_resize.cpp index d7cd0b61..30d9dc14 100644 --- a/src/op_resize.cpp +++ b/src/op_resize.cpp @@ -25,7 +25,7 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" -#include "core/detail/casting.hpp" +#include "core/detail/hip_utils.hpp" #include "core/exception.hpp" #include "core/status_type.h" #include "core/wrappers/interpolation_wrapper.hpp" @@ -45,10 +45,12 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); - Kernels::Device::resize<<>>(inputWrapper, outputWrapper, scaleX, scaleY); + constexpr auto kernel = Kernels::Device::resize< + InterpolationWrapper, ImageWrapper>; + dim3 block = detail::GetBlockSize2D(); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, scaleX, scaleY); break; } @@ -62,13 +64,13 @@ void dispatch_resize_interp(hipStream_t stream, const Tensor& input, const Tenso template void dispatch_resize_dtype(hipStream_t stream, const Tensor& input, const Tensor& output, eInterpolationType interpolation, eDeviceType device) { - static const std::unordered_map< - eInterpolationType, - std::function> - funcs = {{eInterpolationType::INTERP_TYPE_NEAREST, dispatch_resize_interp}, - {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp}, - {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp} - }; + static const std::unordered_map> + funcs = { + {eInterpolationType::INTERP_TYPE_NEAREST, + dispatch_resize_interp}, + {eInterpolationType::INTERP_TYPE_LINEAR, dispatch_resize_interp}, + {eInterpolationType::INTERP_TYPE_CUBIC, dispatch_resize_interp}}; if (!funcs.contains(interpolation)) { throw Exception("Operation does not support the given interpolation mode.", eStatusType::NOT_IMPLEMENTED); @@ -78,8 +80,8 @@ void dispatch_resize_dtype(hipStream_t stream, const Tensor& input, const Tensor func(stream, input, output, device); } -void Resize::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, - eInterpolationType interpolation, eDeviceType device) const { +void Resize::operator()(hipStream_t stream, const Tensor& input, const Tensor& output, eInterpolationType interpolation, + eDeviceType device) const { CHECK_TENSOR_DEVICE(input, device); CHECK_TENSOR_DEVICE(output, device); diff --git a/src/op_rotate.cpp b/src/op_rotate.cpp index 28806779..be7d73a4 100644 --- a/src/op_rotate.cpp +++ b/src/op_rotate.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include "common/array_wrapper.hpp" #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/interpolation_wrapper.hpp" #include "kernels/device/rotate_device.hpp" #include "kernels/host/rotate_host.hpp" @@ -59,10 +60,12 @@ void dispatch_rotate_interp(hipStream_t stream, const Tensor &input, const Tenso switch (device) { case eDeviceType::GPU: { - dim3 block(32, 16); - dim3 grid((outputWrap.width() + block.x - 1) / block.x, (outputWrap.height() + block.y - 1) / block.y, - outputWrap.batches()); - Kernels::Device::rotate<<>>(inputWrap, outputWrap, matWrap); + constexpr auto kernel = Kernels::Device::rotate< + InterpolationWrapper, ImageWrapper, + ArrayWrapper>; + dim3 block = detail::GetBlockSize2D(); + dim3 grid = detail::GetGridSize2D(outputWrap.width(), outputWrap.height(), outputWrap.batches(), block); + kernel<<>>(inputWrap, outputWrap, matWrap); break; } diff --git a/src/op_thresholding.cpp b/src/op_thresholding.cpp index 420ac84a..2325c5eb 100644 --- a/src/op_thresholding.cpp +++ b/src/op_thresholding.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include #include "common/validation_helpers.hpp" +#include "core/detail/hip_utils.hpp" #include "core/wrappers/generic_tensor_wrapper.hpp" #include "core/wrappers/image_wrapper.hpp" #include "kernels/device/thresholding_device.hpp" @@ -50,34 +51,45 @@ void dispatch_threshold_dtype(hipStream_t stream, const Tensor &input, const Ten const auto width = input.shape()[input.shape().layout().width_index()]; if (device == eDeviceType::GPU) { - dim3 block(64, 16); - dim3 grid((width + block.x - 1) / block.x, (height + block.y - 1) / block.y, outputWrapper.batches()); - switch (m_threshType) { - case THRESH_BINARY: - Kernels::Device::binary_generic<<>>(inputWrapper, outputWrapper, - GenericTensorWrapper(thresh), - GenericTensorWrapper(maxVal)); + case THRESH_BINARY: { + constexpr auto kernel = Kernels::Device::binary_generic, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, GenericTensorWrapper(thresh), + GenericTensorWrapper(maxVal)); break; - case THRESH_BINARY_INV: - Kernels::Device::binary_inv_generic<<>>(inputWrapper, outputWrapper, - GenericTensorWrapper(thresh), - GenericTensorWrapper(maxVal)); + } + case THRESH_BINARY_INV: { + constexpr auto kernel = Kernels::Device::binary_inv_generic, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, GenericTensorWrapper(thresh), + GenericTensorWrapper(maxVal)); break; - case THRESH_TRUNC: - Kernels::Device::trunc_generic<<>>(inputWrapper, outputWrapper, - GenericTensorWrapper(thresh)); + } + case THRESH_TRUNC: { + constexpr auto kernel = Kernels::Device::trunc_generic, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, GenericTensorWrapper(thresh)); break; - case THRESH_TOZERO: - Kernels::Device::tozero_generic<<>>(inputWrapper, outputWrapper, - GenericTensorWrapper(thresh)); + } + case THRESH_TOZERO: { + constexpr auto kernel = Kernels::Device::tozero_generic, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, GenericTensorWrapper(thresh)); break; - case THRESH_TOZERO_INV: - Kernels::Device::tozeroinv_generic<<>>(inputWrapper, outputWrapper, - GenericTensorWrapper(thresh)); + } + case THRESH_TOZERO_INV: { + constexpr auto kernel = Kernels::Device::tozeroinv_generic, ImageWrapper>; + dim3 block = detail::GetBlockSize1D(); + dim3 grid = detail::GetGridSize1D(width, height, outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, GenericTensorWrapper(thresh)); break; + } } - } else if (device == eDeviceType::CPU) { switch (m_threshType) { case THRESH_BINARY: diff --git a/src/op_warp_perspective.cpp b/src/op_warp_perspective.cpp index ca77fc8c..4db3f3f4 100644 --- a/src/op_warp_perspective.cpp +++ b/src/op_warp_perspective.cpp @@ -26,8 +26,8 @@ THE SOFTWARE. #include "common/array_wrapper.hpp" #include "common/validation_helpers.hpp" #include "core/detail/casting.hpp" +#include "core/detail/hip_utils.hpp" #include "core/detail/math/math.hpp" -#include "core/detail/type_traits.hpp" #include "kernels/device/warp_perspective_device.hpp" #include "kernels/host/warp_perspective_host.hpp" @@ -42,10 +42,12 @@ void dispatch_warp_perspective_interp(hipStream_t stream, const Tensor &input, c // Launch CPU/GPU kernel depending on requested device type. switch (device) { case eDeviceType::GPU: { - dim3 block(64, 16); - dim3 grid((outputWrapper.width() + block.x - 1) / block.x, (outputWrapper.height() + block.y - 1) / block.y, - outputWrapper.batches()); - Kernels::Device::warp_perspective<<>>(inputWrapper, outputWrapper, transform); + constexpr auto kernel = Kernels::Device::warp_perspective< + InterpolationWrapper, ImageWrapper, ArrayWrapper>; + dim3 block = detail::GetBlockSize2D(); + dim3 grid = + detail::GetGridSize2D(outputWrapper.width(), outputWrapper.height(), outputWrapper.batches(), block); + kernel<<>>(inputWrapper, outputWrapper, transform); break; }