From e89b7d81cf353e89e2ba6b8accea1710a28ccb65 Mon Sep 17 00:00:00 2001 From: svcbuild Date: Wed, 8 Apr 2026 15:57:44 -0700 Subject: [PATCH 1/2] Average Blur operator added --- docs/reference/rocCV-supported-operators.rst | 1 + docs/supported-operators.md | 1 + include/kernels/device/avg_blur_device.hpp | 191 +++++++++++ include/kernels/host/avg_blur_host.hpp | 94 ++++++ include/op_avg_blur.hpp | 94 ++++++ python/include/operators/py_op_avg_blur.hpp | 45 +++ python/src/main.cpp | 2 + python/src/operators/py_op_avg_blur.cpp | 125 ++++++++ src/op_avg_blur.cpp | 289 +++++++++++++++++ .../src/tests/operators/test_op_avg_blur.cpp | 302 ++++++++++++++++++ tests/roccv/python/test_op_avg_blur.py | 56 ++++ 11 files changed, 1200 insertions(+) create mode 100644 include/kernels/device/avg_blur_device.hpp create mode 100644 include/kernels/host/avg_blur_host.hpp create mode 100644 include/op_avg_blur.hpp create mode 100644 python/include/operators/py_op_avg_blur.hpp create mode 100644 python/src/operators/py_op_avg_blur.cpp create mode 100644 src/op_avg_blur.cpp create mode 100644 tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp create mode 100644 tests/roccv/python/test_op_avg_blur.py diff --git a/docs/reference/rocCV-supported-operators.rst b/docs/reference/rocCV-supported-operators.rst index 97dbdb20..77faa592 100644 --- a/docs/reference/rocCV-supported-operators.rst +++ b/docs/reference/rocCV-supported-operators.rst @@ -13,6 +13,7 @@ The rocCV is a collection of the following computer vision operators that are su :header: "Operator", "Description", "Datatypes", "Layouts" "AdvCvtColor","Converts color spaces with explicit BT601/BT709/BT2020 coefficients, including NV12/NV21 paths.","U8","NHWC, HWC" + "AvgBlur","Applies an average blur filter on images in a tensor.","U8, U16, S16, S32, F32","NHWC, HWC" "BilateralFilter", "Applies a bilateral filter.", "U8", "NHWC, HWC" "BndBox","Draws bounding boxes on the images in a tensor.","U8","NHWC, HWC" "Composite","Composites two input tensors using a provided alpha mask.","U8, S8, U32, S32, F32","NHWC, HWC" diff --git a/docs/supported-operators.md b/docs/supported-operators.md index 081ebfcb..46038563 100644 --- a/docs/supported-operators.md +++ b/docs/supported-operators.md @@ -5,6 +5,7 @@ See below for a list of Computer Vision operators rocCV supports. |Name|Description|Datatypes|Layouts|CPU/GPU Support| |-|-|-|-|-| |AdvCvtColor|Converts color spaces using explicit BT601/BT709/BT2020 coefficients and supports NV12/NV21 paths.|U8|NHWC, HWC|Both| +|AvgBlur|Applies an average blur filter on images in a tensor.|U8, U16, S16, S32, F32|NHWC, HWC|Both| |BilateralFilter|Applies a bilateral filter to reduce image noise while preserving strong edges.|U8, S8, U16, S16, U32, S32, F32, F64|NHWC, HWC|Both| |BndBox|Draws rectangular borders using the specified locations, dimensions and colors, in order to show the locations and sizes of objects in an image.|U8, S8|NHWC, HWC|Both| |CenterCrop|Crops an image at its center with a given rectangular region.|U8, S8, U16, S16, U32, S32, F32, F64|NHWC, HWC|Both| diff --git a/include/kernels/device/avg_blur_device.hpp b/include/kernels/device/avg_blur_device.hpp new file mode 100644 index 00000000..f72016ef --- /dev/null +++ b/include/kernels/device/avg_blur_device.hpp @@ -0,0 +1,191 @@ +/** +Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include + +#include "core/detail/casting.hpp" +#include "core/detail/type_traits.hpp" +#include "core/detail/vector_utils.hpp" +#include "operator_types.h" + +namespace Kernels { +namespace Device { + +template +__global__ void avg_blur_2d(SrcWrapper input, DstWrapper output, + int kernelWidth, int kernelHeight, + int kernelAnchorX, int kernelAnchorY) { + using namespace roccv::detail; + using WorkType = MakeType>; + + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + const int b = blockIdx.z; + + if (x >= output.width() || y >= output.height() || b >= output.batches()) { + return; + } + + // Initialize accumulator + WorkType sum = SetAll(0.0f); + + // Compute the sum over the kernel window + for (int ky = 0; ky < kernelHeight; ++ky) { + int srcY = y - kernelAnchorY + ky; + + for (int kx = 0; kx < kernelWidth; ++kx) { + int srcX = x - kernelAnchorX + kx; + + // Read pixel through border wrapper (handles out-of-bounds) + T pixel = input.at(b, srcY, srcX, 0); + + // Accumulate as float to avoid overflow + sum = sum + StaticCast(pixel); + } + } + + // Compute average by dividing by kernel area + float kernelArea = static_cast(kernelWidth * kernelHeight); + WorkType average = sum / kernelArea; + + // Write result with saturation + output.at(b, y, x, 0) = SaturateCast(average); +} + +/** + * @brief Optimized 1D Horizontal Average Blur Kernel with Shared Memory Tiling + * + * First pass of separable average blur. Applies horizontal averaging using shared + * memory to reduce global memory bandwidth. Each block loads a tile of input data + * into shared memory, then each thread computes its output using only shared memory. + */ +template +__global__ void avg_blur_horizontal(SrcWrapper input, DstWrapper output, + int kernelWidth, int kernelAnchorX) { + using namespace roccv::detail; + using WorkType = MakeType>; + + const int x = blockDim.x * blockIdx.x + threadIdx.x; + const int y = blockIdx.y; + const int b = blockIdx.z; + + // Check bounds for loading - all threads in block participate + const bool validBlock = (y < output.height() && b < output.batches()); + + // Shared memory tile: BLOCK_WIDTH + kernel halo on both sides + extern __shared__ char smem[]; + T* tile = reinterpret_cast(smem); + + const int halo = kernelWidth - 1; + const int tileWidth = BLOCK_WIDTH + halo; + + // Load data into shared memory with halo + // All threads participate in loading to avoid divergence before syncthreads + for (int i = threadIdx.x; i < tileWidth; i += blockDim.x) { + int srcX = blockIdx.x * BLOCK_WIDTH + i - kernelAnchorX; + if (validBlock) { + tile[i] = input.at(b, y, srcX, 0); + } + } + + __syncthreads(); + + // Now check if this specific thread has valid output to compute + if (x >= output.width() || !validBlock) { + return; + } + + // Compute horizontal average using shared memory + WorkType sum = SetAll(0.0f); + + int tileIdx = threadIdx.x + kernelAnchorX; + for (int kx = 0; kx < kernelWidth; ++kx) { + sum = sum + StaticCast(tile[tileIdx - kernelAnchorX + kx]); + } + + float kernelSize = static_cast(kernelWidth); + WorkType average = sum / kernelSize; + + output.at(b, y, x, 0) = SaturateCast(average); +} + +/** + * @brief Optimized 1D Vertical Average Blur Kernel with Shared Memory Tiling + * + * Second pass of separable average blur. Applies vertical averaging using shared + * memory to reduce global memory bandwidth. Each block loads a column tile into + * shared memory, then computes vertical averages. + */ +template +__global__ void avg_blur_vertical(SrcWrapper input, DstWrapper output, + int kernelHeight, int kernelAnchorY) { + using namespace roccv::detail; + using WorkType = MakeType>; + + const int x = blockIdx.x; + const int y = blockDim.y * blockIdx.y + threadIdx.y; + const int b = blockIdx.z; + + // Check bounds for loading - all threads in block participate + const bool validBlock = (x < output.width() && b < output.batches()); + + // Shared memory tile: BLOCK_HEIGHT + kernel halo on both sides + extern __shared__ char smem[]; + T* tile = reinterpret_cast(smem); + + const int halo = kernelHeight - 1; + const int tileHeight = BLOCK_HEIGHT + halo; + + // Load data into shared memory with halo + // All threads participate in loading to avoid divergence before syncthreads + for (int i = threadIdx.y; i < tileHeight; i += blockDim.y) { + int srcY = blockIdx.y * BLOCK_HEIGHT + i - kernelAnchorY; + if (validBlock) { + tile[i] = input.at(b, srcY, x, 0); + } + } + + __syncthreads(); + + // Now check if this specific thread has valid output to compute + if (y >= output.height() || !validBlock) { + return; + } + + // Compute vertical average using shared memory + WorkType sum = SetAll(0.0f); + + int tileIdx = threadIdx.y + kernelAnchorY; + for (int ky = 0; ky < kernelHeight; ++ky) { + sum = sum + StaticCast(tile[tileIdx - kernelAnchorY + ky]); + } + + float kernelSize = static_cast(kernelHeight); + WorkType average = sum / kernelSize; + + output.at(b, y, x, 0) = SaturateCast(average); +} + +} // namespace Device +} // namespace Kernels diff --git a/include/kernels/host/avg_blur_host.hpp b/include/kernels/host/avg_blur_host.hpp new file mode 100644 index 00000000..d2aced8b --- /dev/null +++ b/include/kernels/host/avg_blur_host.hpp @@ -0,0 +1,94 @@ +/** +Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include "core/detail/casting.hpp" +#include "core/detail/type_traits.hpp" +#include "core/detail/vector_utils.hpp" +#include "operator_types.h" + +namespace Kernels { +namespace Host { + +/** + * @brief CPU implementation of 2D Average Blur + * + * Applies a 2D averaging filter to the input image on CPU. Each output pixel is computed + * as the average of all pixels within the kernel window. + * + * @tparam T Data type (e.g., uchar3, float4, etc.) + * @tparam SrcWrapper Border wrapper type for input + * @tparam DstWrapper Image wrapper type for output + * + * @param input Input image with border handling + * @param output Output image + * @param kernelWidth Width of the averaging kernel + * @param kernelHeight Height of the averaging kernel + * @param kernelAnchorX X-coordinate of kernel anchor point + * @param kernelAnchorY Y-coordinate of kernel anchor point + */ +template +void avg_blur_2d(SrcWrapper input, DstWrapper output, + int kernelWidth, int kernelHeight, + int kernelAnchorX, int kernelAnchorY) { + using namespace roccv::detail; + using WorkType = MakeType>; + + // Compute kernel area for averaging + float kernelArea = static_cast(kernelWidth * kernelHeight); + + // Iterate over all batches + for (int b = 0; b < output.batches(); b++) { + // Iterate over all output pixels + for (int y = 0; y < output.height(); y++) { + for (int x = 0; x < output.width(); x++) { + // Initialize accumulator + WorkType sum = SetAll(0.0f); + + // Compute the sum over the kernel window + for (int ky = 0; ky < kernelHeight; ++ky) { + int srcY = y - kernelAnchorY + ky; + + for (int kx = 0; kx < kernelWidth; ++kx) { + int srcX = x - kernelAnchorX + kx; + + // Read pixel through border wrapper (handles out-of-bounds) + T pixel = input.at(b, srcY, srcX, 0); + + // Accumulate as float to avoid overflow + sum = sum + StaticCast(pixel); + } + } + + // Compute average by dividing by kernel area + WorkType average = sum / kernelArea; + + // Write result with saturation + output.at(b, y, x, 0) = SaturateCast(average); + } + } + } +} + +} // namespace Host +} // namespace Kernels diff --git a/include/op_avg_blur.hpp b/include/op_avg_blur.hpp new file mode 100644 index 00000000..96bf132a --- /dev/null +++ b/include/op_avg_blur.hpp @@ -0,0 +1,94 @@ +/** +Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#pragma once + +#include + +#include "core/tensor.hpp" +#include "operator_types.h" + +namespace roccv { +/** + * @brief Class for managing the Average Blur operator + * + */ +class AvgBlur final : public IOperator { + public: + /** + * @brief Constructs an AvgBlur object. + * + */ + AvgBlur(); + + /** + * @brief Destroy the AvgBlur object + * + */ + ~AvgBlur(); + + /** + * @brief Construct a new AvgBlur object. + * The object can be used to apply an average (mean) blur filter on images in a tensor. + * + * Limitations: + * + * Input: + * Supported TensorLayout(s): [NHWC, HWC] + * Channels: [1, 3, 4] + * Supported DataType(s): [U8, U16, S16, S32, F32] + * + * Output: + * Supported TensorLayout(s): [NHWC, HWC] + * Channels: [1, 3, 4] + * Supported DataType(s): [U8, U16, S16, S32, F32] + * + * Input/Output dependency + * + * Property | Input == Output + * -------------- | ------------- + * Data Layout | Yes + * Data Type | Yes + * Number | Yes + * Channels | Yes + * Width | Yes + * Height | Yes + * + * + * @param[in] stream The HIP stream to run this operation on. + * @param[in] input Input tensor with image batch data + * @param[out] output Output tensor for storing modified image batch data + * @param[in] kernelWidth Width of the averaging kernel. + * @param[in] kernelHeight Height of the averaging kernel. + * @param[in] kernelAnchorX Kernel anchor in X direction. + * @param[in] kernelAnchorY Kernel anchor in Y direction. + * @param[in] borderMode A border type to identify the pixel extrapolation + * method (e.g. BORDER_TYPE_CONSTANT or BORDER_TYPE_REPLICATE) + * @param[in] borderValue Set as 0 unless using a constant border. + * @param[in] device The device which this operation should run on. + * (Default: eDeviceType::GPU) + * + */ + void operator()(hipStream_t stream, const Tensor& input, const Tensor& output, + int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY, + eBorderType borderType, float4 borderValue, eDeviceType device = eDeviceType::GPU); +}; +} // namespace roccv \ No newline at end of file diff --git a/python/include/operators/py_op_avg_blur.hpp b/python/include/operators/py_op_avg_blur.hpp new file mode 100644 index 00000000..c5bb4d62 --- /dev/null +++ b/python/include/operators/py_op_avg_blur.hpp @@ -0,0 +1,45 @@ +/** +Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include +#include + +#include "py_stream.hpp" +#include "py_tensor.hpp" + +namespace py = pybind11; + +class PyOpAvgBlur { + public: + static void Export(py::module& m); + static PyTensor Execute(PyTensor& input, py::tuple kernelSize, py::tuple anchor, + eBorderType borderMode, py::list borderValue, + std::optional> stream, + eDeviceType device); + static void ExecuteInto(PyTensor& output, PyTensor& input, + py::tuple kernelSize, py::tuple anchor, + eBorderType borderMode, py::list borderValue, + std::optional> stream, + eDeviceType device); +}; diff --git a/python/src/main.cpp b/python/src/main.cpp index ddbec687..31444ab4 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -23,6 +23,7 @@ THE SOFTWARE. #include #include +#include "operators/py_op_avg_blur.hpp" #include "operators/py_op_bilateral_filter.hpp" #include "operators/py_op_bnd_box.hpp" #include "operators/py_op_center_crop.hpp" @@ -61,6 +62,7 @@ PYBIND11_MODULE(rocpycv, m) { PyStructs::Export(m); PyStream::Export(m); PyTensor::Export(m); + PyOpAvgBlur::Export(m); PyOpCustomCrop::Export(m); PyOpNonMaxSuppression::Export(m); PyOpNormalize::Export(m); diff --git a/python/src/operators/py_op_avg_blur.cpp b/python/src/operators/py_op_avg_blur.cpp new file mode 100644 index 00000000..46ae9c1c --- /dev/null +++ b/python/src/operators/py_op_avg_blur.cpp @@ -0,0 +1,125 @@ +/** +Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "operators/py_op_avg_blur.hpp" + +#include + +#include "py_helpers.hpp" + +PyTensor PyOpAvgBlur::Execute(PyTensor& input, py::tuple kernelSize, py::tuple anchor, + eBorderType borderMode, py::list borderValue, + std::optional> stream, + eDeviceType device) { + + hipStream_t hipStream = stream.has_value() ? stream.value().get().getStream() : nullptr; + + // Extract kernel size from tuple + int kernelWidth = kernelSize[0].cast(); + int kernelHeight = kernelSize[1].cast(); + + // Extract anchor from tuple + int kernelAnchorX = anchor[0].cast(); + int kernelAnchorY = anchor[1].cast(); + + auto inputTensor = input.getTensor(); + auto outputTensor = std::make_shared(inputTensor->shape(), inputTensor->dtype(), device); + + roccv::AvgBlur op; + op(hipStream, *inputTensor, *outputTensor, kernelWidth, kernelHeight, + kernelAnchorX, kernelAnchorY, borderMode, GetFloat4FromPyList(borderValue), device); + return PyTensor(outputTensor); +} + +void PyOpAvgBlur::ExecuteInto(PyTensor& output, PyTensor& input, + py::tuple kernelSize, py::tuple anchor, + eBorderType borderMode, py::list borderValue, + std::optional> stream, + eDeviceType device) { + + hipStream_t hipStream = stream.has_value() ? stream.value().get().getStream() : nullptr; + + // Extract kernel size from tuple + int kernelWidth = kernelSize[0].cast(); + int kernelHeight = kernelSize[1].cast(); + + // Extract anchor from tuple + int kernelAnchorX = anchor[0].cast(); + int kernelAnchorY = anchor[1].cast(); + + roccv::AvgBlur op; + op(hipStream, *input.getTensor(), *output.getTensor(), kernelWidth, kernelHeight, + kernelAnchorX, kernelAnchorY, borderMode, GetFloat4FromPyList(borderValue), device); +} + +void PyOpAvgBlur::Export(py::module& m) { + using namespace py::literals; + m.def("avg_blur", &PyOpAvgBlur::Execute, "src"_a, "kernelSize"_a, "anchor"_a, + "borderMode"_a, "borderValue"_a, + "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc( + + Executes the Average Blur operation on the given HIP stream. + + See also: + Refer to the rocCV C++ API reference for more information on this operation. + + Args: + src (rocpycv.Tensor): Input tensor containing one or more images. + kernelSize (Tuple[int, int]): Kernel size as (width, height). + anchor (Tuple[int, int]): Kernel anchor position as (x, y). + borderMode (rocpycv.eBorderType): The border type to identify the pixel extrapolation method. + borderValue (List[float]): The color value to use when a constant border is selected. + stream (rocpycv.Stream, optional): HIP stream to run this operation on. + device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. + + Returns: + rocpycv.Tensor: The output tensor with blurred images. + )pbdoc"); + + m.def("avg_blur_into", &PyOpAvgBlur::ExecuteInto, "dst"_a, "src"_a, "kernelSize"_a, "anchor"_a, + "borderMode"_a, "borderValue"_a, + "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc( + + + Executes the Average Blur operation on the given HIP stream, writing results into a pre-allocated output tensor. + + This operation applies an average (mean) blur filter on images in a tensor. + The filter computes the average of all pixels within a rectangular kernel + for each pixel position in the image. + + See also: + Refer to the rocCV C++ API reference for more information on this operation. + + Args: + dst (rocpycv.Tensor): The output tensor which results are written to. + src (rocpycv.Tensor): Input tensor containing one or more images. + kernelSize (Tuple[int, int]): Kernel size as (width, height). + anchor (Tuple[int, int]): Kernel anchor position as (x, y). + borderMode (rocpycv.eBorderType): The border type to identify the pixel extrapolation method. + borderValue (List[float]): The color value to use when a constant border is selected (4 elements for RGBA). + stream (rocpycv.Stream, optional): HIP stream to run this operation on. + device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. + + Returns: + None + )pbdoc"); +} diff --git a/src/op_avg_blur.cpp b/src/op_avg_blur.cpp new file mode 100644 index 00000000..19276890 --- /dev/null +++ b/src/op_avg_blur.cpp @@ -0,0 +1,289 @@ +/** +Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include "op_avg_blur.hpp" + +#include + +#include +#include +#include + +#include "common/array_wrapper.hpp" +#include "core/wrappers/border_wrapper.hpp" +#include "common/validation_helpers.hpp" +#include "core/detail/casting.hpp" +#include "core/detail/math/math.hpp" +#include "core/detail/type_traits.hpp" +#include "kernels/device/avg_blur_device.hpp" +#include "kernels/host/avg_blur_host.hpp" + +namespace roccv { +AvgBlur::AvgBlur() {} + +AvgBlur::~AvgBlur() {} + +template +void dispatch_avg_blur_border_mode(hipStream_t stream, const Tensor &input, const Tensor &output, + int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY, + T borderValue, eDeviceType device) { + BorderWrapper inputWrapper(input, borderValue); + ImageWrapper outputWrapper(output); + + if (outputWrapper.channels() > 4 || outputWrapper.channels() < 1) { + throw Exception("Invalid channel size: cannot be greater than 4 or less than 1.", eStatusType::OUT_OF_BOUNDS); + } + + if (device == eDeviceType::GPU) { + dim3 block(32, 8); + dim3 grid((outputWrapper.width() + block.x - 1) / block.x, + (outputWrapper.height() + block.y - 1) / block.y, + outputWrapper.batches()); + + Kernels::Device::avg_blur_2d, ImageWrapper> + <<>>( + inputWrapper, outputWrapper, + kernelWidth, kernelHeight, + kernelAnchorX, kernelAnchorY); + + hipError_t err = hipGetLastError(); + if (err != hipSuccess) { + throw Exception("Average blur kernel launch failed: " + std::string(hipGetErrorString(err)), + eStatusType::INVALID_OPERATION); + } + } else if (device == eDeviceType::CPU) { + Kernels::Host::avg_blur_2d, ImageWrapper>( + inputWrapper, outputWrapper, + kernelWidth, kernelHeight, + kernelAnchorX, kernelAnchorY); + } +} + +/** + * @brief Optimized separable average blur using two-pass filtering with shared memory tiling + * + * This function implements average blur as two 1D convolutions (horizontal then vertical) + * instead of a single 2D convolution. Benefits: + * - Reduces computational complexity from O(k²) to O(2k) per pixel + * - Uses shared memory tiling to reduce global memory bandwidth + * - Typically 1.5-5.5× faster than direct 2D approach for kernels >= 5×5 + * + * Trade-off: Requires intermediate buffer (same size as output) + */ +template +void dispatch_avg_blur_border_mode_separable(hipStream_t stream, const Tensor &input, const Tensor &output, + int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY, + T borderValue, eDeviceType device) { + BorderWrapper inputWrapper(input, borderValue); + ImageWrapper outputWrapper(output); + + if (outputWrapper.channels() > 4 || outputWrapper.channels() < 1) { + throw Exception("Invalid channel size: cannot be greater than 4 or less than 1.", eStatusType::OUT_OF_BOUNDS); + } + + if (device == eDeviceType::GPU) { + // Allocate intermediate buffer for horizontal pass result + Tensor intermediate(output.shape(), output.dtype(), device); + ImageWrapper intermediateWrapper(intermediate); + + // Constants for tiling + constexpr int BLOCK_WIDTH = 128; // Horizontal: 128 threads/block for good occupancy + constexpr int BLOCK_HEIGHT = 128; // Vertical: testing 128 for maximum occupancy + + // Horizontal pass: input -> intermediate + { + dim3 block(BLOCK_WIDTH, 1); + dim3 grid((outputWrapper.width() + BLOCK_WIDTH - 1) / BLOCK_WIDTH, + outputWrapper.height(), + outputWrapper.batches()); + + int halo = kernelWidth - 1; + int tileWidth = BLOCK_WIDTH + halo; + size_t smemSize = tileWidth * sizeof(T); + + Kernels::Device::avg_blur_horizontal, ImageWrapper> + <<>>( + inputWrapper, intermediateWrapper, + kernelWidth, kernelAnchorX); + + hipError_t err = hipGetLastError(); + if (err != hipSuccess) { + throw Exception("Horizontal blur kernel launch failed: " + std::string(hipGetErrorString(err)), + eStatusType::INVALID_OPERATION); + } + } + + // Vertical pass: intermediate -> output + // Wrap intermediate buffer with border handling for vertical direction + BorderWrapper intermediateWrapperWithBorder(intermediate, borderValue); + { + dim3 block(1, BLOCK_HEIGHT); + dim3 grid(outputWrapper.width(), + (outputWrapper.height() + BLOCK_HEIGHT - 1) / BLOCK_HEIGHT, + outputWrapper.batches()); + + int halo = kernelHeight - 1; + int tileHeight = BLOCK_HEIGHT + halo; + size_t smemSize = tileHeight * sizeof(T); + + Kernels::Device::avg_blur_vertical, ImageWrapper> + <<>>( + intermediateWrapperWithBorder, outputWrapper, + kernelHeight, kernelAnchorY); + + hipError_t err = hipGetLastError(); + if (err != hipSuccess) { + throw Exception("Vertical blur kernel launch failed: " + std::string(hipGetErrorString(err)), + eStatusType::INVALID_OPERATION); + } + } + + } else if (device == eDeviceType::CPU) { + Kernels::Host::avg_blur_2d, ImageWrapper>( + inputWrapper, outputWrapper, + kernelWidth, kernelHeight, + kernelAnchorX, kernelAnchorY); + } +} + +template +void dispatch_avg_blur_dtype(hipStream_t stream, const Tensor &input, const Tensor &output, + int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY, + eBorderType borderMode, float4 borderValue, eDeviceType device) { + // Select kernel dispatcher based on requested border mode. + // clang-format off + static const std::unordered_map> + funcs = { + {eBorderType::BORDER_TYPE_REPLICATE, dispatch_avg_blur_border_mode}, + {eBorderType::BORDER_TYPE_CONSTANT, dispatch_avg_blur_border_mode}, + {eBorderType::BORDER_TYPE_REFLECT, dispatch_avg_blur_border_mode}, + {eBorderType::BORDER_TYPE_REFLECT101, dispatch_avg_blur_border_mode}, + {eBorderType::BORDER_TYPE_WRAP, dispatch_avg_blur_border_mode} + }; + // clang-format on + + if (!funcs.contains(borderMode)) { + throw Exception("AvgBlur does not support the given border mode.", eStatusType::NOT_IMPLEMENTED); + } + + auto func = funcs.at(borderMode); + func(stream, input, output, kernelWidth, kernelHeight, kernelAnchorX, kernelAnchorY, + detail::SaturateCast(borderValue), device); +} + +template +void dispatch_avg_blur_dtype_optimized(hipStream_t stream, const Tensor &input, const Tensor &output, + int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY, + eBorderType borderMode, float4 borderValue, eDeviceType device) { + // Select kernel dispatcher based on requested border mode. + // clang-format off + static const std::unordered_map> + funcs = { + {eBorderType::BORDER_TYPE_REPLICATE, dispatch_avg_blur_border_mode_separable}, + {eBorderType::BORDER_TYPE_CONSTANT, dispatch_avg_blur_border_mode_separable}, + {eBorderType::BORDER_TYPE_REFLECT, dispatch_avg_blur_border_mode_separable}, + {eBorderType::BORDER_TYPE_REFLECT101, dispatch_avg_blur_border_mode_separable}, + {eBorderType::BORDER_TYPE_WRAP, dispatch_avg_blur_border_mode_separable} + }; + // clang-format on + + if (!funcs.contains(borderMode)) { + throw Exception("AvgBlur does not support the given border mode.", eStatusType::NOT_IMPLEMENTED); + } + + auto func = funcs.at(borderMode); + func(stream, input, output, kernelWidth, kernelHeight, kernelAnchorX, kernelAnchorY, + detail::SaturateCast(borderValue), device); +} + +void AvgBlur::operator()(hipStream_t stream, const Tensor &input, const Tensor &output, + int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY, + eBorderType borderType, float4 borderValue, eDeviceType device) { + // Verify that the tensors are located on the right device (CPU or GPU). + CHECK_TENSOR_DEVICE(input, device); + CHECK_TENSOR_DEVICE(output, device); + + // Ensure all tensors are using supported datatypes + CHECK_TENSOR_DATATYPES(input, DATA_TYPE_U8, DATA_TYPE_U16, DATA_TYPE_S16, DATA_TYPE_S32, DATA_TYPE_F32); + CHECK_TENSOR_DATATYPES(output, DATA_TYPE_U8, DATA_TYPE_U16, DATA_TYPE_S16, DATA_TYPE_S32, DATA_TYPE_F32); + + // Ensure all tensors are using supported layouts. + CHECK_TENSOR_LAYOUT(input, TENSOR_LAYOUT_NHWC, TENSOR_LAYOUT_HWC); + CHECK_TENSOR_LAYOUT(output, TENSOR_LAYOUT_NHWC, TENSOR_LAYOUT_HWC); + + CHECK_TENSOR_CHANNELS(input, 1, 3, 4); + + eDataType dtype = input.dtype().etype(); + int64_t channels = input.shape(input.layout().channels_index()); + + // Ensure the layout and shapes for the input/output tensor match + CHECK_TENSOR_COMPARISON(input.layout() == output.layout()); + CHECK_TENSOR_COMPARISON(input.shape() == output.shape()); + + // Choose between direct 2D and optimized separable approach based on kernel size + // Benchmark results show crossover point between 9×9 and 11×11 kernels + // Below 11×11: kernel launch overhead dominates computational savings + // At 11×11 and above: separable filtering provides 1.3-3× speedup + // For GPU only - CPU uses direct 2D in both cases + bool useSeparable = (device == eDeviceType::GPU) && + (kernelWidth >= 11 || kernelHeight >= 11); + + if (useSeparable) { + // Use optimized separable filtering with shared memory tiling + // clang-format off + static const std::unordered_map< + eDataType, std::array, 4>> + funcs_optimized = { + {eDataType::DATA_TYPE_U8, {dispatch_avg_blur_dtype_optimized, 0, dispatch_avg_blur_dtype_optimized, dispatch_avg_blur_dtype_optimized}}, + {eDataType::DATA_TYPE_U16, {dispatch_avg_blur_dtype_optimized, 0, dispatch_avg_blur_dtype_optimized, dispatch_avg_blur_dtype_optimized}}, + {eDataType::DATA_TYPE_S16, {dispatch_avg_blur_dtype_optimized, 0, dispatch_avg_blur_dtype_optimized, dispatch_avg_blur_dtype_optimized}}, + {eDataType::DATA_TYPE_S32, {dispatch_avg_blur_dtype_optimized, 0, dispatch_avg_blur_dtype_optimized, dispatch_avg_blur_dtype_optimized}}, + {eDataType::DATA_TYPE_F32, {dispatch_avg_blur_dtype_optimized, 0, dispatch_avg_blur_dtype_optimized, dispatch_avg_blur_dtype_optimized}} + }; + // clang-format on + + auto func = funcs_optimized.at(dtype)[channels - 1]; + if (func == 0) throw Exception("Not mapped to a defined function.", eStatusType::INVALID_OPERATION); + func(stream, input, output, kernelWidth, kernelHeight, kernelAnchorX, kernelAnchorY, borderType, borderValue, device); + } else { + // Use direct 2D convolution (better for small kernels) + // clang-format off + static const std::unordered_map< + eDataType, std::array, 4>> + funcs = { + {eDataType::DATA_TYPE_U8, {dispatch_avg_blur_dtype, 0, dispatch_avg_blur_dtype, dispatch_avg_blur_dtype}}, + {eDataType::DATA_TYPE_U16, {dispatch_avg_blur_dtype, 0, dispatch_avg_blur_dtype, dispatch_avg_blur_dtype}}, + {eDataType::DATA_TYPE_S16, {dispatch_avg_blur_dtype, 0, dispatch_avg_blur_dtype, dispatch_avg_blur_dtype}}, + {eDataType::DATA_TYPE_S32, {dispatch_avg_blur_dtype, 0, dispatch_avg_blur_dtype, dispatch_avg_blur_dtype}}, + {eDataType::DATA_TYPE_F32, {dispatch_avg_blur_dtype, 0, dispatch_avg_blur_dtype, dispatch_avg_blur_dtype}} + }; + // clang-format on + + auto func = funcs.at(dtype)[channels - 1]; + if (func == 0) throw Exception("Not mapped to a defined function.", eStatusType::INVALID_OPERATION); + func(stream, input, output, kernelWidth, kernelHeight, kernelAnchorX, kernelAnchorY, borderType, borderValue, device); + } + + ////////////////////////////////////////////////////////////////////////////////////////////////////////// +} +} // namespace roccv \ No newline at end of file diff --git a/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp b/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp new file mode 100644 index 00000000..33ec722e --- /dev/null +++ b/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp @@ -0,0 +1,302 @@ +/** +Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include +#include + +#include "test_helpers.hpp" + +using namespace roccv; +using namespace roccv::detail; +using namespace roccv::tests; + +namespace { + +/** + * @brief Verified golden C++ model for the average blur operation on one image. + * + * @tparam T Vectorized datatype of the image's pixels. + * @param[in] input Input tensor containing image data. + * @param[out] output Output tensor containing blurred image data. + * @param[in] kernelWidth Width of the averaging kernel + * @param[in] kernelHeight Height of the averaging kernel + * @param[in] kernelAnchorX X-coordinate of kernel anchor point + * @param[in] kernelAnchorY Y-coordinate of kernel anchor point + * @param[in] borderMode Border pixel extrapolation method + * @param[in] borderValue Color for constant border mode + * @return None. + */ +template > +void GenerateGoldenAvgBlur(std::vector& input, std::vector& output, int32_t batchSize, Size2D imageSize, + int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY, T borderValue) { + BorderWrapper src(ImageWrapper(input, batchSize, imageSize.w, imageSize.h), borderValue); + ImageWrapper dst(output, batchSize, imageSize.w, imageSize.h); + using namespace roccv::detail; + using WorkType = MakeType>; + + // Compute kernel area for averaging + float kernelArea = static_cast(kernelWidth * kernelHeight); + + // Iterate over all batches + for (int b = 0; b < dst.batches(); b++) { + // Iterate over all output pixels + for (int j = 0; j < dst.height(); j++) { + for (int i = 0; i < dst.width(); i++) { + // Initialize accumulator + WorkType sum = SetAll(0.0f); + + // Compute the sum over the kernel window + for (int ky = 0; ky < kernelHeight; ++ky) { + int srcY = j - kernelAnchorY + ky; + + for (int kx = 0; kx < kernelWidth; ++kx) { + int srcX = i - kernelAnchorX + kx; + + T pixel = src.at(b, srcY, srcX, 0); + + sum = sum + StaticCast(pixel); + } + } + + WorkType average = sum / kernelArea; + + dst.at(b, j, i, 0) = SaturateCast(average); + } + } + } +} + +/** + * @brief Tests correctness of the average blur operator, comparing it against a generated golden result. + * + * @tparam T Underlying datatype of the image's pixels. + * @tparam BT Base type of the image data. + * @param[in] batchSize Number of images in the batch. + * @param[in] width Width of each image in the batch. + * @param[in] height Height of each image in the batch. + * @param[in] format Image format. + * @param[in] kernelWidth Width of the averaging kernel + * @param[in] kernelHeight Height of the averaging kernel + * @param[in] borderColor Color for constant border mode + * @param[in] device Device this correctness test should be run on. + */ +template > +void TestCorrectness(int batchSize, int width, int height, ImageFormat format, int kernelWidth, int kernelHeight, + float4 borderColor, eDeviceType device) { + // Create input and output tensor based on test parameters + Tensor input(batchSize, {width, height}, format, device); + Tensor output(batchSize, {width, height}, format, device); + + // Create a vector and fill it with random data. + std::vector inputData(input.shape().size()); + FillVector(inputData); + if constexpr (std::is_floating_point_v) { + for (int i = 0; i < inputData.size(); i++) { + inputData[i] *= static_cast(std::numeric_limits::max()); + } + } + + // Copy generated input data into input tensor + CopyVectorIntoTensor(input, inputData); + + // Calculate kernel anchor (center of kernel) + int kernelAnchorX = kernelWidth / 2; + int kernelAnchorY = kernelHeight / 2; + + hipStream_t stream; + HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); + AvgBlur op; + op(stream, input, output, kernelWidth, kernelHeight, kernelAnchorX, kernelAnchorY, BorderMode, borderColor, + device); + HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream)); + HIP_VALIDATE_NO_ERRORS(hipStreamDestroy(stream)); + + // Copy data from output tensor into a host allocated vector + std::vector outputData(output.shape().size()); + CopyTensorIntoVector(outputData, output); + + // Calculate golden reference + std::vector refData(output.shape().size()); + GenerateGoldenAvgBlur(inputData, refData, batchSize, {width, height}, kernelWidth, kernelHeight, + kernelAnchorX, kernelAnchorY, detail::SaturateCast(borderColor)); + + // Compare data in actual output versus the generated golden reference image + CompareVectorsNear(outputData, refData, 1); +} + +} // namespace + +int main(int argc, char** argv) { + TEST_CASES_BEGIN(); + + // GPU correctness tests - U8 (1 channel) + TEST_CASE((TestCorrectness(1, 20, 20, FMT_U8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(4, 32, 32, FMT_U8, 5, 5, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(2, 24, 24, FMT_U8, 7, 7, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + + // GPU correctness tests - RGB8 (3 channels) + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(2, 32, 32, FMT_RGB8, 5, 5, {100.0, 100.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGB8, 3, 3, {50.0, 50.0, 50.0, 0.0}, + eDeviceType::GPU))); + + // GPU correctness tests - RGBA8 (4 channels) + TEST_CASE((TestCorrectness(1, 10, 10, FMT_RGBA8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(5, 64, 64, FMT_RGBA8, 5, 5, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(2, 20, 20, FMT_RGBA8, 7, 7, {128.0, 128.0, 128.0, 255.0}, + eDeviceType::GPU))); + + // GPU correctness tests - S16 (signed 16-bit) + TEST_CASE((TestCorrectness(1, 20, 20, FMT_S16, 3, 3, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(3, 20, 20, FMT_S16, 5, 5, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(2, 16, 16, FMT_RGBs16, 3, 3, {100.0, 100.0, 100.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAs16, 5, 5, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + + // GPU correctness tests - U16 (unsigned 16-bit) + TEST_CASE((TestCorrectness(1, 20, 20, FMT_U16, 3, 3, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(2, 20, 20, FMT_U16, 5, 5, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB16, 3, 3, {500.0, 600.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGBA16, 5, 5, {500.0, 600.0, 0.0, 0.0}, + eDeviceType::GPU))); + + // GPU correctness tests - S32 (signed 32-bit) + TEST_CASE((TestCorrectness(1, 32, 32, FMT_S32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(2, 32, 32, FMT_S32, 5, 5, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGBs32, 3, 3, {100.0, 100.0, 100.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAs32, 5, 5, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + + // GPU correctness tests - F32 (float) + TEST_CASE((TestCorrectness(1, 24, 24, FMT_F32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(2, 24, 24, FMT_F32, 5, 5, {600.0, 500.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBf32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBf32, 7, 7, {600.0, 500.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAf32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAf32, 5, 5, {600.0, 500.0, 0.0, 0.0}, + eDeviceType::GPU))); + + // CPU correctness tests - U8 (1 channel) + TEST_CASE((TestCorrectness(1, 20, 20, FMT_U8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(4, 32, 32, FMT_U8, 5, 5, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(2, 24, 24, FMT_U8, 7, 7, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + + // CPU correctness tests - RGB8 (3 channels) + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(2, 32, 32, FMT_RGB8, 5, 5, {100.0, 100.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGB8, 3, 3, {50.0, 50.0, 50.0, 0.0}, + eDeviceType::CPU))); + + // CPU correctness tests - RGBA8 (4 channels) + TEST_CASE((TestCorrectness(1, 10, 10, FMT_RGBA8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(5, 64, 64, FMT_RGBA8, 5, 5, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(2, 20, 20, FMT_RGBA8, 7, 7, {128.0, 128.0, 128.0, 255.0}, + eDeviceType::CPU))); + + // CPU correctness tests - S16 (signed 16-bit) + TEST_CASE((TestCorrectness(1, 20, 20, FMT_S16, 3, 3, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(3, 20, 20, FMT_S16, 5, 5, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(2, 16, 16, FMT_RGBs16, 3, 3, {100.0, 100.0, 100.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAs16, 5, 5, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + + // CPU correctness tests - U16 (unsigned 16-bit) + TEST_CASE((TestCorrectness(1, 20, 20, FMT_U16, 3, 3, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(2, 20, 20, FMT_U16, 5, 5, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB16, 3, 3, {500.0, 600.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGBA16, 5, 5, {500.0, 600.0, 0.0, 0.0}, + eDeviceType::CPU))); + + // CPU correctness tests - S32 (signed 32-bit) + TEST_CASE((TestCorrectness(1, 32, 32, FMT_S32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(2, 32, 32, FMT_S32, 5, 5, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGBs32, 3, 3, {100.0, 100.0, 100.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAs32, 5, 5, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + + // CPU correctness tests - F32 (float) + TEST_CASE((TestCorrectness(1, 24, 24, FMT_F32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(2, 24, 24, FMT_F32, 5, 5, {600.0, 500.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBf32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBf32, 7, 7, {600.0, 500.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAf32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAf32, 5, 5, {600.0, 500.0, 0.0, 0.0}, + eDeviceType::CPU))); + + // Additional edge cases - various kernel sizes + TEST_CASE((TestCorrectness(1, 40, 40, FMT_RGB8, 9, 9, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(1, 50, 50, FMT_RGBAf32, 11, 11, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::GPU))); + TEST_CASE((TestCorrectness(1, 40, 40, FMT_RGB8, 9, 9, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + TEST_CASE((TestCorrectness(1, 50, 50, FMT_RGBAf32, 11, 11, {0.0, 0.0, 0.0, 0.0}, + eDeviceType::CPU))); + + TEST_CASES_END(); +} diff --git a/tests/roccv/python/test_op_avg_blur.py b/tests/roccv/python/test_op_avg_blur.py new file mode 100644 index 00000000..d38ab2fe --- /dev/null +++ b/tests/roccv/python/test_op_avg_blur.py @@ -0,0 +1,56 @@ +# ############################################################################## +# Copyright (c) - 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# ############################################################################## + +import pytest +import rocpycv + +from test_helpers import compare_tensors, generate_tensor + + +@pytest.mark.parametrize("device", [rocpycv.eDeviceType.GPU, rocpycv.eDeviceType.CPU]) +@pytest.mark.parametrize("dtype", [rocpycv.eDataType.U8, rocpycv.eDataType.U16, rocpycv.eDataType.S16, rocpycv.eDataType.S32, rocpycv.eDataType.F32]) +@pytest.mark.parametrize("border_mode", [rocpycv.eBorderType.CONSTANT]) +@pytest.mark.parametrize("border_val", [[0, 0, 0, 0]]) +@pytest.mark.parametrize("kernel_size,anchor", [ + ((3, 3), (1, 1)), + ((7, 7), (3, 3)), + ((9, 9), (4, 4)) +]) +@pytest.mark.parametrize("channels", [1, 3, 4]) +@pytest.mark.parametrize("samples,height,width", [ + (1, 56, 64), + (3, 14, 40), + (5, 45, 105) +]) +def test_op_avg_blur(samples, height, width, channels, border_mode, border_val, kernel_size, anchor, dtype, device): + input = generate_tensor(samples, width, height, channels, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + + stream = rocpycv.Stream() + rocpycv.avg_blur_into(output_golden, input, kernel_size, anchor, + border_mode, border_val, stream, device) + output = rocpycv.avg_blur(input, kernel_size, anchor, + border_mode, border_val, stream, device) + stream.synchronize() + + compare_tensors(output, output_golden) From 4ec666c916205af7087bdc931a4d98ae4b02d2ca Mon Sep 17 00:00:00 2001 From: svcbuild Date: Thu, 9 Apr 2026 14:33:14 -0700 Subject: [PATCH 2/2] Made -1,-1 the default anchor, added anchors to C++ tests --- src/op_avg_blur.cpp | 8 ++ .../src/tests/operators/test_op_avg_blur.cpp | 124 +++++++++--------- tests/roccv/python/test_op_avg_blur.py | 2 +- 3 files changed, 70 insertions(+), 64 deletions(-) diff --git a/src/op_avg_blur.cpp b/src/op_avg_blur.cpp index 19276890..27563e97 100644 --- a/src/op_avg_blur.cpp +++ b/src/op_avg_blur.cpp @@ -231,6 +231,14 @@ void AvgBlur::operator()(hipStream_t stream, const Tensor &input, const Tensor & CHECK_TENSOR_CHANNELS(input, 1, 3, 4); + // Handle default anchor (-1, -1) by setting to kernel center + if (kernelAnchorX == -1) { + kernelAnchorX = kernelWidth / 2; + } + if (kernelAnchorY == -1) { + kernelAnchorY = kernelHeight / 2; + } + eDataType dtype = input.dtype().etype(); int64_t channels = input.shape(input.layout().channels_index()); diff --git a/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp b/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp index 33ec722e..38f566d4 100644 --- a/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp +++ b/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp @@ -100,12 +100,14 @@ void GenerateGoldenAvgBlur(std::vector& input, std::vector& output, int3 * @param[in] format Image format. * @param[in] kernelWidth Width of the averaging kernel * @param[in] kernelHeight Height of the averaging kernel + * @param[in] kernelAnchorX X-coordinate of kernel anchor point + * @param[in] kernelAnchorY Y-coordinate of kernel anchor point * @param[in] borderColor Color for constant border mode * @param[in] device Device this correctness test should be run on. */ template > void TestCorrectness(int batchSize, int width, int height, ImageFormat format, int kernelWidth, int kernelHeight, - float4 borderColor, eDeviceType device) { + int kernelAnchorX, int kernelAnchorY, float4 borderColor, eDeviceType device) { // Create input and output tensor based on test parameters Tensor input(batchSize, {width, height}, format, device); Tensor output(batchSize, {width, height}, format, device); @@ -122,10 +124,6 @@ void TestCorrectness(int batchSize, int width, int height, ImageFormat format, i // Copy generated input data into input tensor CopyVectorIntoTensor(input, inputData); - // Calculate kernel anchor (center of kernel) - int kernelAnchorX = kernelWidth / 2; - int kernelAnchorY = kernelHeight / 2; - hipStream_t stream; HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream)); AvgBlur op; @@ -153,149 +151,149 @@ int main(int argc, char** argv) { TEST_CASES_BEGIN(); // GPU correctness tests - U8 (1 channel) - TEST_CASE((TestCorrectness(1, 20, 20, FMT_U8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_U8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(4, 32, 32, FMT_U8, 5, 5, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(4, 32, 32, FMT_U8, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(2, 24, 24, FMT_U8, 7, 7, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 24, 24, FMT_U8, 7, 7, 3, 3, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); // GPU correctness tests - RGB8 (3 channels) - TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(2, 32, 32, FMT_RGB8, 5, 5, {100.0, 100.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 32, 32, FMT_RGB8, 5, 5, 2, 2, {100.0, 100.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGB8, 3, 3, {50.0, 50.0, 50.0, 0.0}, + TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGB8, 3, 3, 1, 1, {50.0, 50.0, 50.0, 0.0}, eDeviceType::GPU))); // GPU correctness tests - RGBA8 (4 channels) - TEST_CASE((TestCorrectness(1, 10, 10, FMT_RGBA8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 10, 10, FMT_RGBA8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(5, 64, 64, FMT_RGBA8, 5, 5, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(5, 64, 64, FMT_RGBA8, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(2, 20, 20, FMT_RGBA8, 7, 7, {128.0, 128.0, 128.0, 255.0}, + TEST_CASE((TestCorrectness(2, 20, 20, FMT_RGBA8, 7, 7, 3, 3, {128.0, 128.0, 128.0, 255.0}, eDeviceType::GPU))); // GPU correctness tests - S16 (signed 16-bit) - TEST_CASE((TestCorrectness(1, 20, 20, FMT_S16, 3, 3, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_S16, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(3, 20, 20, FMT_S16, 5, 5, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(3, 20, 20, FMT_S16, 5, 5, 2, 2, {500.0, 500.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(2, 16, 16, FMT_RGBs16, 3, 3, {100.0, 100.0, 100.0, 0.0}, + TEST_CASE((TestCorrectness(2, 16, 16, FMT_RGBs16, 3, 3, 1, 1, {100.0, 100.0, 100.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAs16, 5, 5, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAs16, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); // GPU correctness tests - U16 (unsigned 16-bit) - TEST_CASE((TestCorrectness(1, 20, 20, FMT_U16, 3, 3, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_U16, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(2, 20, 20, FMT_U16, 5, 5, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 20, 20, FMT_U16, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB16, 3, 3, {500.0, 600.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB16, 3, 3, 1, 1, {500.0, 600.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGBA16, 5, 5, {500.0, 600.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGBA16, 5, 5, 2, 2, {500.0, 600.0, 0.0, 0.0}, eDeviceType::GPU))); // GPU correctness tests - S32 (signed 32-bit) - TEST_CASE((TestCorrectness(1, 32, 32, FMT_S32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 32, 32, FMT_S32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(2, 32, 32, FMT_S32, 5, 5, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 32, 32, FMT_S32, 5, 5, 2, 2, {500.0, 500.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGBs32, 3, 3, {100.0, 100.0, 100.0, 0.0}, + TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGBs32, 3, 3, 1, 1, {100.0, 100.0, 100.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAs32, 5, 5, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAs32, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); // GPU correctness tests - F32 (float) - TEST_CASE((TestCorrectness(1, 24, 24, FMT_F32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 24, 24, FMT_F32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(2, 24, 24, FMT_F32, 5, 5, {600.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 24, 24, FMT_F32, 5, 5, 2, 2, {600.0, 500.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBf32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBf32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBf32, 7, 7, {600.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBf32, 7, 7, 3, 3, {600.0, 500.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAf32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAf32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAf32, 5, 5, {600.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAf32, 5, 5, 2, 2, {600.0, 500.0, 0.0, 0.0}, eDeviceType::GPU))); // CPU correctness tests - U8 (1 channel) - TEST_CASE((TestCorrectness(1, 20, 20, FMT_U8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_U8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(4, 32, 32, FMT_U8, 5, 5, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(4, 32, 32, FMT_U8, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(2, 24, 24, FMT_U8, 7, 7, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 24, 24, FMT_U8, 7, 7, 3, 3, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); // CPU correctness tests - RGB8 (3 channels) - TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(2, 32, 32, FMT_RGB8, 5, 5, {100.0, 100.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 32, 32, FMT_RGB8, 5, 5, 2, 2, {100.0, 100.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGB8, 3, 3, {50.0, 50.0, 50.0, 0.0}, + TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGB8, 3, 3, 1, 1, {50.0, 50.0, 50.0, 0.0}, eDeviceType::CPU))); // CPU correctness tests - RGBA8 (4 channels) - TEST_CASE((TestCorrectness(1, 10, 10, FMT_RGBA8, 3, 3, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 10, 10, FMT_RGBA8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(5, 64, 64, FMT_RGBA8, 5, 5, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(5, 64, 64, FMT_RGBA8, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(2, 20, 20, FMT_RGBA8, 7, 7, {128.0, 128.0, 128.0, 255.0}, + TEST_CASE((TestCorrectness(2, 20, 20, FMT_RGBA8, 7, 7, 3, 3, {128.0, 128.0, 128.0, 255.0}, eDeviceType::CPU))); // CPU correctness tests - S16 (signed 16-bit) - TEST_CASE((TestCorrectness(1, 20, 20, FMT_S16, 3, 3, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_S16, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(3, 20, 20, FMT_S16, 5, 5, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(3, 20, 20, FMT_S16, 5, 5, 2, 2, {500.0, 500.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(2, 16, 16, FMT_RGBs16, 3, 3, {100.0, 100.0, 100.0, 0.0}, + TEST_CASE((TestCorrectness(2, 16, 16, FMT_RGBs16, 3, 3, 1, 1, {100.0, 100.0, 100.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAs16, 5, 5, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAs16, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); // CPU correctness tests - U16 (unsigned 16-bit) - TEST_CASE((TestCorrectness(1, 20, 20, FMT_U16, 3, 3, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_U16, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(2, 20, 20, FMT_U16, 5, 5, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 20, 20, FMT_U16, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB16, 3, 3, {500.0, 600.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGB16, 3, 3, 1, 1, {500.0, 600.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGBA16, 5, 5, {500.0, 600.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 20, 20, FMT_RGBA16, 5, 5, 2, 2, {500.0, 600.0, 0.0, 0.0}, eDeviceType::CPU))); // CPU correctness tests - S32 (signed 32-bit) - TEST_CASE((TestCorrectness(1, 32, 32, FMT_S32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 32, 32, FMT_S32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(2, 32, 32, FMT_S32, 5, 5, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 32, 32, FMT_S32, 5, 5, 2, 2, {500.0, 500.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGBs32, 3, 3, {100.0, 100.0, 100.0, 0.0}, + TEST_CASE((TestCorrectness(1, 16, 16, FMT_RGBs32, 3, 3, 1, 1, {100.0, 100.0, 100.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAs32, 5, 5, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAs32, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); // CPU correctness tests - F32 (float) - TEST_CASE((TestCorrectness(1, 24, 24, FMT_F32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 24, 24, FMT_F32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(2, 24, 24, FMT_F32, 5, 5, {600.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 24, 24, FMT_F32, 5, 5, 2, 2, {600.0, 500.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBf32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBf32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBf32, 7, 7, {600.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBf32, 7, 7, 3, 3, {600.0, 500.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAf32, 3, 3, {500.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 24, 24, FMT_RGBAf32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAf32, 5, 5, {600.0, 500.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(2, 24, 24, FMT_RGBAf32, 5, 5, 2, 2, {600.0, 500.0, 0.0, 0.0}, eDeviceType::CPU))); // Additional edge cases - various kernel sizes - TEST_CASE((TestCorrectness(1, 40, 40, FMT_RGB8, 9, 9, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 40, 40, FMT_RGB8, 9, 9, 4, 4, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(1, 50, 50, FMT_RGBAf32, 11, 11, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 50, 50, FMT_RGBAf32, 11, 11, 5, 5, {0.0, 0.0, 0.0, 0.0}, eDeviceType::GPU))); - TEST_CASE((TestCorrectness(1, 40, 40, FMT_RGB8, 9, 9, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 40, 40, FMT_RGB8, 9, 9, 4, 4, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); - TEST_CASE((TestCorrectness(1, 50, 50, FMT_RGBAf32, 11, 11, {0.0, 0.0, 0.0, 0.0}, + TEST_CASE((TestCorrectness(1, 50, 50, FMT_RGBAf32, 11, 11, 5, 5, {0.0, 0.0, 0.0, 0.0}, eDeviceType::CPU))); TEST_CASES_END(); diff --git a/tests/roccv/python/test_op_avg_blur.py b/tests/roccv/python/test_op_avg_blur.py index d38ab2fe..877c175d 100644 --- a/tests/roccv/python/test_op_avg_blur.py +++ b/tests/roccv/python/test_op_avg_blur.py @@ -33,7 +33,7 @@ @pytest.mark.parametrize("border_val", [[0, 0, 0, 0]]) @pytest.mark.parametrize("kernel_size,anchor", [ ((3, 3), (1, 1)), - ((7, 7), (3, 3)), + ((7, 7), (-1, -1)), ((9, 9), (4, 4)) ]) @pytest.mark.parametrize("channels", [1, 3, 4])