Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/reference/rocCV-supported-operators.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ The rocCV is a collection of the following computer vision operators that are su
:header: "Operator", "Description", "Datatypes", "Layouts"

"AdvCvtColor","Converts color spaces with explicit BT601/BT709/BT2020 coefficients, including NV12/NV21 paths.","U8","NHWC, HWC"
"AvgBlur","Applies an average blur filter on images in a tensor.","U8, U16, S16, S32, F32","NHWC, HWC"
"BilateralFilter", "Applies a bilateral filter.", "U8", "NHWC, HWC"
"BndBox","Draws bounding boxes on the images in a tensor.","U8","NHWC, HWC"
"Composite","Composites two input tensors using a provided alpha mask.","U8, S8, U32, S32, F32","NHWC, HWC"
Expand Down
1 change: 1 addition & 0 deletions docs/supported-operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ See below for a list of Computer Vision operators rocCV supports.
|Name|Description|Datatypes|Layouts|CPU/GPU Support|
|-|-|-|-|-|
|AdvCvtColor|Converts color spaces using explicit BT601/BT709/BT2020 coefficients and supports NV12/NV21 paths.|U8|NHWC, HWC|Both|
|AvgBlur|Applies an average blur filter on images in a tensor.|U8, U16, S16, S32, F32|NHWC, HWC|Both|
|BilateralFilter|Applies a bilateral filter to reduce image noise while preserving strong edges.|U8, S8, U16, S16, U32, S32, F32, F64|NHWC, HWC|Both|
|BndBox|Draws rectangular borders using the specified locations, dimensions and colors, in order to show the locations and sizes of objects in an image.|U8, S8|NHWC, HWC|Both|
|CenterCrop|Crops an image at its center with a given rectangular region.|U8, S8, U16, S16, U32, S32, F32, F64|NHWC, HWC|Both|
Expand Down
191 changes: 191 additions & 0 deletions include/kernels/device/avg_blur_device.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
/**
Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#include <hip/hip_runtime.h>

#include "core/detail/casting.hpp"
#include "core/detail/type_traits.hpp"
#include "core/detail/vector_utils.hpp"
#include "operator_types.h"

namespace Kernels {
namespace Device {

template <typename T, typename SrcWrapper, typename DstWrapper>
__global__ void avg_blur_2d(SrcWrapper input, DstWrapper output,
int kernelWidth, int kernelHeight,
int kernelAnchorX, int kernelAnchorY) {
using namespace roccv::detail;
using WorkType = MakeType<float, NumElements<T>>;

const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int b = blockIdx.z;

if (x >= output.width() || y >= output.height() || b >= output.batches()) {
return;
}

// Initialize accumulator
WorkType sum = SetAll<WorkType>(0.0f);

// Compute the sum over the kernel window
for (int ky = 0; ky < kernelHeight; ++ky) {
int srcY = y - kernelAnchorY + ky;

for (int kx = 0; kx < kernelWidth; ++kx) {
int srcX = x - kernelAnchorX + kx;

// Read pixel through border wrapper (handles out-of-bounds)
T pixel = input.at(b, srcY, srcX, 0);

// Accumulate as float to avoid overflow
sum = sum + StaticCast<WorkType>(pixel);
}
}

// Compute average by dividing by kernel area
float kernelArea = static_cast<float>(kernelWidth * kernelHeight);
WorkType average = sum / kernelArea;

// Write result with saturation
output.at(b, y, x, 0) = SaturateCast<T>(average);
}

/**
* @brief Optimized 1D Horizontal Average Blur Kernel with Shared Memory Tiling
*
* First pass of separable average blur. Applies horizontal averaging using shared
* memory to reduce global memory bandwidth. Each block loads a tile of input data
* into shared memory, then each thread computes its output using only shared memory.
*/
template <typename T, int BLOCK_WIDTH, typename SrcWrapper, typename DstWrapper>
__global__ void avg_blur_horizontal(SrcWrapper input, DstWrapper output,
int kernelWidth, int kernelAnchorX) {
using namespace roccv::detail;
using WorkType = MakeType<float, NumElements<T>>;

const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockIdx.y;
const int b = blockIdx.z;

// Check bounds for loading - all threads in block participate
const bool validBlock = (y < output.height() && b < output.batches());

// Shared memory tile: BLOCK_WIDTH + kernel halo on both sides
extern __shared__ char smem[];
T* tile = reinterpret_cast<T*>(smem);

const int halo = kernelWidth - 1;
const int tileWidth = BLOCK_WIDTH + halo;

// Load data into shared memory with halo
// All threads participate in loading to avoid divergence before syncthreads
for (int i = threadIdx.x; i < tileWidth; i += blockDim.x) {
int srcX = blockIdx.x * BLOCK_WIDTH + i - kernelAnchorX;
if (validBlock) {
tile[i] = input.at(b, y, srcX, 0);
}
}

__syncthreads();

// Now check if this specific thread has valid output to compute
if (x >= output.width() || !validBlock) {
return;
}

// Compute horizontal average using shared memory
WorkType sum = SetAll<WorkType>(0.0f);

int tileIdx = threadIdx.x + kernelAnchorX;
for (int kx = 0; kx < kernelWidth; ++kx) {
sum = sum + StaticCast<WorkType>(tile[tileIdx - kernelAnchorX + kx]);
}

float kernelSize = static_cast<float>(kernelWidth);
WorkType average = sum / kernelSize;

output.at(b, y, x, 0) = SaturateCast<T>(average);
}

/**
* @brief Optimized 1D Vertical Average Blur Kernel with Shared Memory Tiling
*
* Second pass of separable average blur. Applies vertical averaging using shared
* memory to reduce global memory bandwidth. Each block loads a column tile into
* shared memory, then computes vertical averages.
*/
template <typename T, int BLOCK_HEIGHT, typename SrcWrapper, typename DstWrapper>
__global__ void avg_blur_vertical(SrcWrapper input, DstWrapper output,
int kernelHeight, int kernelAnchorY) {
using namespace roccv::detail;
using WorkType = MakeType<float, NumElements<T>>;

const int x = blockIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
const int b = blockIdx.z;

// Check bounds for loading - all threads in block participate
const bool validBlock = (x < output.width() && b < output.batches());

// Shared memory tile: BLOCK_HEIGHT + kernel halo on both sides
extern __shared__ char smem[];
T* tile = reinterpret_cast<T*>(smem);

const int halo = kernelHeight - 1;
const int tileHeight = BLOCK_HEIGHT + halo;

// Load data into shared memory with halo
// All threads participate in loading to avoid divergence before syncthreads
for (int i = threadIdx.y; i < tileHeight; i += blockDim.y) {
int srcY = blockIdx.y * BLOCK_HEIGHT + i - kernelAnchorY;
if (validBlock) {
tile[i] = input.at(b, srcY, x, 0);
}
}

__syncthreads();

// Now check if this specific thread has valid output to compute
if (y >= output.height() || !validBlock) {
return;
}

// Compute vertical average using shared memory
WorkType sum = SetAll<WorkType>(0.0f);

int tileIdx = threadIdx.y + kernelAnchorY;
for (int ky = 0; ky < kernelHeight; ++ky) {
sum = sum + StaticCast<WorkType>(tile[tileIdx - kernelAnchorY + ky]);
}

float kernelSize = static_cast<float>(kernelHeight);
WorkType average = sum / kernelSize;

output.at(b, y, x, 0) = SaturateCast<T>(average);
}

} // namespace Device
} // namespace Kernels
94 changes: 94 additions & 0 deletions include/kernels/host/avg_blur_host.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/**
Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

#pragma once

#include "core/detail/casting.hpp"
#include "core/detail/type_traits.hpp"
#include "core/detail/vector_utils.hpp"
#include "operator_types.h"

namespace Kernels {
namespace Host {

/**
* @brief CPU implementation of 2D Average Blur
*
* Applies a 2D averaging filter to the input image on CPU. Each output pixel is computed
* as the average of all pixels within the kernel window.
*
* @tparam T Data type (e.g., uchar3, float4, etc.)
* @tparam SrcWrapper Border wrapper type for input
* @tparam DstWrapper Image wrapper type for output
*
* @param input Input image with border handling
* @param output Output image
* @param kernelWidth Width of the averaging kernel
* @param kernelHeight Height of the averaging kernel
* @param kernelAnchorX X-coordinate of kernel anchor point
* @param kernelAnchorY Y-coordinate of kernel anchor point
*/
template <typename T, typename SrcWrapper, typename DstWrapper>
void avg_blur_2d(SrcWrapper input, DstWrapper output,
int kernelWidth, int kernelHeight,
int kernelAnchorX, int kernelAnchorY) {
using namespace roccv::detail;
using WorkType = MakeType<float, NumElements<T>>;

// Compute kernel area for averaging
float kernelArea = static_cast<float>(kernelWidth * kernelHeight);

// Iterate over all batches
for (int b = 0; b < output.batches(); b++) {
// Iterate over all output pixels
for (int y = 0; y < output.height(); y++) {
for (int x = 0; x < output.width(); x++) {
// Initialize accumulator
WorkType sum = SetAll<WorkType>(0.0f);

// Compute the sum over the kernel window
for (int ky = 0; ky < kernelHeight; ++ky) {
int srcY = y - kernelAnchorY + ky;

for (int kx = 0; kx < kernelWidth; ++kx) {
int srcX = x - kernelAnchorX + kx;

// Read pixel through border wrapper (handles out-of-bounds)
T pixel = input.at(b, srcY, srcX, 0);

// Accumulate as float to avoid overflow
sum = sum + StaticCast<WorkType>(pixel);
}
}

// Compute average by dividing by kernel area
WorkType average = sum / kernelArea;

// Write result with saturation
output.at(b, y, x, 0) = SaturateCast<T>(average);
}
}
}
}

} // namespace Host
} // namespace Kernels
94 changes: 94 additions & 0 deletions include/op_avg_blur.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/**
Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once

#include <i_operator.hpp>

#include "core/tensor.hpp"
#include "operator_types.h"

namespace roccv {
/**
* @brief Class for managing the Average Blur operator
*
*/
class AvgBlur final : public IOperator {
public:
/**
* @brief Constructs an AvgBlur object.
*
*/
AvgBlur();

/**
* @brief Destroy the AvgBlur object
*
*/
~AvgBlur();

/**
* @brief Construct a new AvgBlur object.
* The object can be used to apply an average (mean) blur filter on images in a tensor.
*
* Limitations:
*
* Input:
* Supported TensorLayout(s): [NHWC, HWC]
* Channels: [1, 3, 4]
* Supported DataType(s): [U8, U16, S16, S32, F32]
*
* Output:
* Supported TensorLayout(s): [NHWC, HWC]
* Channels: [1, 3, 4]
* Supported DataType(s): [U8, U16, S16, S32, F32]
*
* Input/Output dependency
*
* Property | Input == Output
* -------------- | -------------
* Data Layout | Yes
* Data Type | Yes
* Number | Yes
* Channels | Yes
* Width | Yes
* Height | Yes
*
*
* @param[in] stream The HIP stream to run this operation on.
* @param[in] input Input tensor with image batch data
* @param[out] output Output tensor for storing modified image batch data
* @param[in] kernelWidth Width of the averaging kernel.
* @param[in] kernelHeight Height of the averaging kernel.
* @param[in] kernelAnchorX Kernel anchor in X direction.
* @param[in] kernelAnchorY Kernel anchor in Y direction.
* @param[in] borderMode A border type to identify the pixel extrapolation
* method (e.g. BORDER_TYPE_CONSTANT or BORDER_TYPE_REPLICATE)
* @param[in] borderValue Set as 0 unless using a constant border.
* @param[in] device The device which this operation should run on.
* (Default: eDeviceType::GPU)
*
*/
void operator()(hipStream_t stream, const Tensor& input, const Tensor& output,
int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY,
eBorderType borderType, float4 borderValue, eDeviceType device = eDeviceType::GPU);
};
} // namespace roccv
Loading