ROCm · paveltc · Apr 8, 2026 · Apr 9, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/docs/reference/rocCV-supported-operators.rst b/docs/reference/rocCV-supported-operators.rst
@@ -13,6 +13,7 @@ The rocCV is a collection of the following computer vision operators that are su
     :header: "Operator", "Description", "Datatypes", "Layouts"
 
     "AdvCvtColor","Converts color spaces with explicit BT601/BT709/BT2020 coefficients, including NV12/NV21 paths.","U8","NHWC, HWC"
+    "AvgBlur","Applies an average blur filter on images in a tensor.","U8, U16, S16, S32, F32","NHWC, HWC"
     "BilateralFilter", "Applies a bilateral filter.", "U8", "NHWC, HWC"
     "BndBox","Draws bounding boxes on the images in a tensor.","U8","NHWC, HWC"
     "Composite","Composites two input tensors using a provided alpha mask.","U8, S8, U32, S32, F32","NHWC, HWC"

diff --git a/docs/supported-operators.md b/docs/supported-operators.md
@@ -5,6 +5,7 @@ See below for a list of Computer Vision operators rocCV supports.
 |Name|Description|Datatypes|Layouts|CPU/GPU Support|
 |-|-|-|-|-|
 |AdvCvtColor|Converts color spaces using explicit BT601/BT709/BT2020 coefficients and supports NV12/NV21 paths.|U8|NHWC, HWC|Both|
+|AvgBlur|Applies an average blur filter on images in a tensor.|U8, U16, S16, S32, F32|NHWC, HWC|Both|
 |BilateralFilter|Applies a bilateral filter to reduce image noise while preserving strong edges.|U8, S8, U16, S16, U32, S32, F32, F64|NHWC, HWC|Both|
 |BndBox|Draws rectangular borders using the specified locations, dimensions and colors, in order to show the locations and sizes of objects in an image.|U8, S8|NHWC, HWC|Both|
 |CenterCrop|Crops an image at its center with a given rectangular region.|U8, S8, U16, S16, U32, S32, F32, F64|NHWC, HWC|Both|

diff --git a/include/kernels/device/avg_blur_device.hpp b/include/kernels/device/avg_blur_device.hpp
@@ -0,0 +1,191 @@
+/**
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include "core/detail/casting.hpp"
+#include "core/detail/type_traits.hpp"
+#include "core/detail/vector_utils.hpp"
+#include "operator_types.h"
+
+namespace Kernels {
+namespace Device {
+
+template <typename T, typename SrcWrapper, typename DstWrapper>
+__global__ void avg_blur_2d(SrcWrapper input, DstWrapper output,
+                            int kernelWidth, int kernelHeight,
+                            int kernelAnchorX, int kernelAnchorY) {
+    using namespace roccv::detail;
+    using WorkType = MakeType<float, NumElements<T>>;
+
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const int b = blockIdx.z;
+
+    if (x >= output.width() || y >= output.height() || b >= output.batches()) {
+        return;
+    }
+
+    // Initialize accumulator
+    WorkType sum = SetAll<WorkType>(0.0f);
+
+    // Compute the sum over the kernel window
+    for (int ky = 0; ky < kernelHeight; ++ky) {
+        int srcY = y - kernelAnchorY + ky;
+
+        for (int kx = 0; kx < kernelWidth; ++kx) {
+            int srcX = x - kernelAnchorX + kx;
+
+            // Read pixel through border wrapper (handles out-of-bounds)
+            T pixel = input.at(b, srcY, srcX, 0);
+
+            // Accumulate as float to avoid overflow
+            sum = sum + StaticCast<WorkType>(pixel);
+        }
+    }
+
+    // Compute average by dividing by kernel area
+    float kernelArea = static_cast<float>(kernelWidth * kernelHeight);
+    WorkType average = sum / kernelArea;
+
+    // Write result with saturation
+    output.at(b, y, x, 0) = SaturateCast<T>(average);
+}
+
+/**
+ * @brief Optimized 1D Horizontal Average Blur Kernel with Shared Memory Tiling
+ *
+ * First pass of separable average blur. Applies horizontal averaging using shared
+ * memory to reduce global memory bandwidth. Each block loads a tile of input data
+ * into shared memory, then each thread computes its output using only shared memory.
+ */
+template <typename T, int BLOCK_WIDTH, typename SrcWrapper, typename DstWrapper>
+__global__ void avg_blur_horizontal(SrcWrapper input, DstWrapper output,
+                                    int kernelWidth, int kernelAnchorX) {
+    using namespace roccv::detail;
+    using WorkType = MakeType<float, NumElements<T>>;
+
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const int y = blockIdx.y;
+    const int b = blockIdx.z;
+
+    // Check bounds for loading - all threads in block participate
+    const bool validBlock = (y < output.height() && b < output.batches());
+
+    // Shared memory tile: BLOCK_WIDTH + kernel halo on both sides
+    extern __shared__ char smem[];
+    T* tile = reinterpret_cast<T*>(smem);
+
+    const int halo = kernelWidth - 1;
+    const int tileWidth = BLOCK_WIDTH + halo;
+
+    // Load data into shared memory with halo
+    // All threads participate in loading to avoid divergence before syncthreads
+    for (int i = threadIdx.x; i < tileWidth; i += blockDim.x) {
+        int srcX = blockIdx.x * BLOCK_WIDTH + i - kernelAnchorX;
+        if (validBlock) {
+            tile[i] = input.at(b, y, srcX, 0);
+        }
+    }
+
+    __syncthreads();
+
+    // Now check if this specific thread has valid output to compute
+    if (x >= output.width() || !validBlock) {
+        return;
+    }
+
+    // Compute horizontal average using shared memory
+    WorkType sum = SetAll<WorkType>(0.0f);
+
+    int tileIdx = threadIdx.x + kernelAnchorX;
+    for (int kx = 0; kx < kernelWidth; ++kx) {
+        sum = sum + StaticCast<WorkType>(tile[tileIdx - kernelAnchorX + kx]);
+    }
+
+    float kernelSize = static_cast<float>(kernelWidth);
+    WorkType average = sum / kernelSize;
+
+    output.at(b, y, x, 0) = SaturateCast<T>(average);
+}
+
+/**
+ * @brief Optimized 1D Vertical Average Blur Kernel with Shared Memory Tiling
+ *
+ * Second pass of separable average blur. Applies vertical averaging using shared
+ * memory to reduce global memory bandwidth. Each block loads a column tile into
+ * shared memory, then computes vertical averages.
+ */
+template <typename T, int BLOCK_HEIGHT, typename SrcWrapper, typename DstWrapper>
+__global__ void avg_blur_vertical(SrcWrapper input, DstWrapper output,
+                                  int kernelHeight, int kernelAnchorY) {
+    using namespace roccv::detail;
+    using WorkType = MakeType<float, NumElements<T>>;
+
+    const int x = blockIdx.x;
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const int b = blockIdx.z;
+
+    // Check bounds for loading - all threads in block participate
+    const bool validBlock = (x < output.width() && b < output.batches());
+
+    // Shared memory tile: BLOCK_HEIGHT + kernel halo on both sides
+    extern __shared__ char smem[];
+    T* tile = reinterpret_cast<T*>(smem);
+
+    const int halo = kernelHeight - 1;
+    const int tileHeight = BLOCK_HEIGHT + halo;
+
+    // Load data into shared memory with halo
+    // All threads participate in loading to avoid divergence before syncthreads
+    for (int i = threadIdx.y; i < tileHeight; i += blockDim.y) {
+        int srcY = blockIdx.y * BLOCK_HEIGHT + i - kernelAnchorY;
+        if (validBlock) {
+            tile[i] = input.at(b, srcY, x, 0);
+        }
+    }
+
+    __syncthreads();
+
+    // Now check if this specific thread has valid output to compute
+    if (y >= output.height() || !validBlock) {
+        return;
+    }
+
+    // Compute vertical average using shared memory
+    WorkType sum = SetAll<WorkType>(0.0f);
+
+    int tileIdx = threadIdx.y + kernelAnchorY;
+    for (int ky = 0; ky < kernelHeight; ++ky) {
+        sum = sum + StaticCast<WorkType>(tile[tileIdx - kernelAnchorY + ky]);
+    }
+
+    float kernelSize = static_cast<float>(kernelHeight);
+    WorkType average = sum / kernelSize;
+
+    output.at(b, y, x, 0) = SaturateCast<T>(average);
+}
+
+}  // namespace Device
+}  // namespace Kernels
diff --git a/include/kernels/host/avg_blur_host.hpp b/include/kernels/host/avg_blur_host.hpp
@@ -0,0 +1,94 @@
+/**
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "core/detail/casting.hpp"
+#include "core/detail/type_traits.hpp"
+#include "core/detail/vector_utils.hpp"
+#include "operator_types.h"
+
+namespace Kernels {
+namespace Host {
+
+/**
+ * @brief CPU implementation of 2D Average Blur
+ *
+ * Applies a 2D averaging filter to the input image on CPU. Each output pixel is computed
+ * as the average of all pixels within the kernel window.
+ *
+ * @tparam T Data type (e.g., uchar3, float4, etc.)
+ * @tparam SrcWrapper Border wrapper type for input
+ * @tparam DstWrapper Image wrapper type for output
+ *
+ * @param input Input image with border handling
+ * @param output Output image
+ * @param kernelWidth Width of the averaging kernel
+ * @param kernelHeight Height of the averaging kernel
+ * @param kernelAnchorX X-coordinate of kernel anchor point
+ * @param kernelAnchorY Y-coordinate of kernel anchor point
+ */
+template <typename T, typename SrcWrapper, typename DstWrapper>
+void avg_blur_2d(SrcWrapper input, DstWrapper output,
+                 int kernelWidth, int kernelHeight,
+                 int kernelAnchorX, int kernelAnchorY) {
+    using namespace roccv::detail;
+    using WorkType = MakeType<float, NumElements<T>>;
+
+    // Compute kernel area for averaging
+    float kernelArea = static_cast<float>(kernelWidth * kernelHeight);
+
+    // Iterate over all batches
+    for (int b = 0; b < output.batches(); b++) {
+        // Iterate over all output pixels
+        for (int y = 0; y < output.height(); y++) {
+            for (int x = 0; x < output.width(); x++) {
+                // Initialize accumulator
+                WorkType sum = SetAll<WorkType>(0.0f);
+
+                // Compute the sum over the kernel window
+                for (int ky = 0; ky < kernelHeight; ++ky) {
+                    int srcY = y - kernelAnchorY + ky;
+
+                    for (int kx = 0; kx < kernelWidth; ++kx) {
+                        int srcX = x - kernelAnchorX + kx;
+
+                        // Read pixel through border wrapper (handles out-of-bounds)
+                        T pixel = input.at(b, srcY, srcX, 0);
+
+                        // Accumulate as float to avoid overflow
+                        sum = sum + StaticCast<WorkType>(pixel);
+                    }
+                }
+
+                // Compute average by dividing by kernel area
+                WorkType average = sum / kernelArea;
+
+                // Write result with saturation
+                output.at(b, y, x, 0) = SaturateCast<T>(average);
+            }
+        }
+    }
+}
+
+}  // namespace Host
+}  // namespace Kernels
diff --git a/include/op_avg_blur.hpp b/include/op_avg_blur.hpp
@@ -0,0 +1,94 @@
+/**
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+#include <i_operator.hpp>
+
+#include "core/tensor.hpp"
+#include "operator_types.h"
+
+namespace roccv {
+/**
+ * @brief Class for managing the Average Blur operator
+ *
+ */
+class AvgBlur final : public IOperator {
+   public:
+    /**
+     * @brief Constructs an AvgBlur object.
+     *
+     */
+    AvgBlur();
+
+    /**
+     * @brief Destroy the AvgBlur object
+     *
+     */
+    ~AvgBlur();
+
+    /**
+     * @brief Construct a new AvgBlur object.
+     * The object can be used to apply an average (mean) blur filter on images in a tensor.
+     *
+     * Limitations:
+     *
+     * Input:
+     *       Supported TensorLayout(s): [NHWC, HWC]
+     *                        Channels: [1, 3, 4]
+     *       Supported DataType(s):     [U8, U16, S16, S32, F32]
+     *
+     * Output:
+     *       Supported TensorLayout(s): [NHWC, HWC]
+     *                        Channels: [1, 3, 4]
+     *       Supported DataType(s):     [U8, U16, S16, S32, F32]
+     *
+     * Input/Output dependency
+     *
+     *       Property      |  Input == Output
+     *      -------------- | -------------
+     *       Data Layout   | Yes
+     *       Data Type     | Yes
+     *       Number        | Yes
+     *       Channels      | Yes
+     *       Width         | Yes
+     *       Height        | Yes
+     *
+     *
+     * @param[in] stream The HIP stream to run this operation on.
+     * @param[in] input Input tensor with image batch data
+     * @param[out] output Output tensor for storing modified image batch data
+     * @param[in] kernelWidth Width of the averaging kernel.
+     * @param[in] kernelHeight Height of the averaging kernel.
+     * @param[in] kernelAnchorX Kernel anchor in X direction.
+     * @param[in] kernelAnchorY Kernel anchor in Y direction.
+     * @param[in] borderMode A border type to identify the pixel extrapolation
+     * method (e.g. BORDER_TYPE_CONSTANT or BORDER_TYPE_REPLICATE)
+     * @param[in] borderValue Set as 0 unless using a constant border.
+     * @param[in] device The device which this operation should run on.
+     * (Default: eDeviceType::GPU)
+     *
+     */
+    void operator()(hipStream_t stream, const Tensor& input, const Tensor& output,
+                    int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY,
+                    eBorderType borderType, float4 borderValue, eDeviceType device = eDeviceType::GPU);
+};
+}  // namespace roccv