From e89b7d81cf353e89e2ba6b8accea1710a28ccb65 Mon Sep 17 00:00:00 2001
From: svcbuild <paveltc>
Date: Wed, 8 Apr 2026 15:57:44 -0700
Subject: [PATCH 1/2] Average Blur operator added

---
 docs/reference/rocCV-supported-operators.rst  |   1 +
 docs/supported-operators.md                   |   1 +
 include/kernels/device/avg_blur_device.hpp    | 191 +++++++++++
 include/kernels/host/avg_blur_host.hpp        |  94 ++++++
 include/op_avg_blur.hpp                       |  94 ++++++
 python/include/operators/py_op_avg_blur.hpp   |  45 +++
 python/src/main.cpp                           |   2 +
 python/src/operators/py_op_avg_blur.cpp       | 125 ++++++++
 src/op_avg_blur.cpp                           | 289 +++++++++++++++++
 .../src/tests/operators/test_op_avg_blur.cpp  | 302 ++++++++++++++++++
 tests/roccv/python/test_op_avg_blur.py        |  56 ++++
 11 files changed, 1200 insertions(+)
 create mode 100644 include/kernels/device/avg_blur_device.hpp
 create mode 100644 include/kernels/host/avg_blur_host.hpp
 create mode 100644 include/op_avg_blur.hpp
 create mode 100644 python/include/operators/py_op_avg_blur.hpp
 create mode 100644 python/src/operators/py_op_avg_blur.cpp
 create mode 100644 src/op_avg_blur.cpp
 create mode 100644 tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp
 create mode 100644 tests/roccv/python/test_op_avg_blur.py
diff --git a/docs/reference/rocCV-supported-operators.rst b/docs/reference/rocCV-supported-operators.rst
index 97dbdb20..77faa592 100644
--- a/docs/reference/rocCV-supported-operators.rst
+++ b/docs/reference/rocCV-supported-operators.rst
@@ -13,6 +13,7 @@ The rocCV is a collection of the following computer vision operators that are su
     :header: "Operator", "Description", "Datatypes", "Layouts"
     
     "AdvCvtColor","Converts color spaces with explicit BT601/BT709/BT2020 coefficients, including NV12/NV21 paths.","U8","NHWC, HWC"
+    "AvgBlur","Applies an average blur filter on images in a tensor.","U8, U16, S16, S32, F32","NHWC, HWC"
     "BilateralFilter", "Applies a bilateral filter.", "U8", "NHWC, HWC"
     "BndBox","Draws bounding boxes on the images in a tensor.","U8","NHWC, HWC"
     "Composite","Composites two input tensors using a provided alpha mask.","U8, S8, U32, S32, F32","NHWC, HWC"
diff --git a/docs/supported-operators.md b/docs/supported-operators.md
index 081ebfcb..46038563 100644
--- a/docs/supported-operators.md
+++ b/docs/supported-operators.md
@@ -5,6 +5,7 @@ See below for a list of Computer Vision operators rocCV supports.
 |Name|Description|Datatypes|Layouts|CPU/GPU Support|
 |-|-|-|-|-|
 |AdvCvtColor|Converts color spaces using explicit BT601/BT709/BT2020 coefficients and supports NV12/NV21 paths.|U8|NHWC, HWC|Both|
+|AvgBlur|Applies an average blur filter on images in a tensor.|U8, U16, S16, S32, F32|NHWC, HWC|Both|
 |BilateralFilter|Applies a bilateral filter to reduce image noise while preserving strong edges.|U8, S8, U16, S16, U32, S32, F32, F64|NHWC, HWC|Both|
 |BndBox|Draws rectangular borders using the specified locations, dimensions and colors, in order to show the locations and sizes of objects in an image.|U8, S8|NHWC, HWC|Both|
 |CenterCrop|Crops an image at its center with a given rectangular region.|U8, S8, U16, S16, U32, S32, F32, F64|NHWC, HWC|Both|
diff --git a/include/kernels/device/avg_blur_device.hpp b/include/kernels/device/avg_blur_device.hpp
new file mode 100644
index 00000000..f72016ef
--- /dev/null
+++ b/include/kernels/device/avg_blur_device.hpp
@@ -0,0 +1,191 @@
+/**
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include "core/detail/casting.hpp"
+#include "core/detail/type_traits.hpp"
+#include "core/detail/vector_utils.hpp"
+#include "operator_types.h"
+
+namespace Kernels {
+namespace Device {
+
+template <typename T, typename SrcWrapper, typename DstWrapper>
+__global__ void avg_blur_2d(SrcWrapper input, DstWrapper output,
+                            int kernelWidth, int kernelHeight,
+                            int kernelAnchorX, int kernelAnchorY) {
+    using namespace roccv::detail;
+    using WorkType = MakeType<float, NumElements<T>>;
+
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const int b = blockIdx.z;
+
+    if (x >= output.width() || y >= output.height() || b >= output.batches()) {
+        return;
+    }
+
+    // Initialize accumulator
+    WorkType sum = SetAll<WorkType>(0.0f);
+
+    // Compute the sum over the kernel window
+    for (int ky = 0; ky < kernelHeight; ++ky) {
+        int srcY = y - kernelAnchorY + ky;
+
+        for (int kx = 0; kx < kernelWidth; ++kx) {
+            int srcX = x - kernelAnchorX + kx;
+
+            // Read pixel through border wrapper (handles out-of-bounds)
+            T pixel = input.at(b, srcY, srcX, 0);
+
+            // Accumulate as float to avoid overflow
+            sum = sum + StaticCast<WorkType>(pixel);
+        }
+    }
+
+    // Compute average by dividing by kernel area
+    float kernelArea = static_cast<float>(kernelWidth * kernelHeight);
+    WorkType average = sum / kernelArea;
+
+    // Write result with saturation
+    output.at(b, y, x, 0) = SaturateCast<T>(average);
+}
+
+/**
+ * @brief Optimized 1D Horizontal Average Blur Kernel with Shared Memory Tiling
+ *
+ * First pass of separable average blur. Applies horizontal averaging using shared
+ * memory to reduce global memory bandwidth. Each block loads a tile of input data
+ * into shared memory, then each thread computes its output using only shared memory.
+ */
+template <typename T, int BLOCK_WIDTH, typename SrcWrapper, typename DstWrapper>
+__global__ void avg_blur_horizontal(SrcWrapper input, DstWrapper output,
+                                    int kernelWidth, int kernelAnchorX) {
+    using namespace roccv::detail;
+    using WorkType = MakeType<float, NumElements<T>>;
+
+    const int x = blockDim.x * blockIdx.x + threadIdx.x;
+    const int y = blockIdx.y;
+    const int b = blockIdx.z;
+
+    // Check bounds for loading - all threads in block participate
+    const bool validBlock = (y < output.height() && b < output.batches());
+
+    // Shared memory tile: BLOCK_WIDTH + kernel halo on both sides
+    extern __shared__ char smem[];
+    T* tile = reinterpret_cast<T*>(smem);
+
+    const int halo = kernelWidth - 1;
+    const int tileWidth = BLOCK_WIDTH + halo;
+
+    // Load data into shared memory with halo
+    // All threads participate in loading to avoid divergence before syncthreads
+    for (int i = threadIdx.x; i < tileWidth; i += blockDim.x) {
+        int srcX = blockIdx.x * BLOCK_WIDTH + i - kernelAnchorX;
+        if (validBlock) {
+            tile[i] = input.at(b, y, srcX, 0);
+        }
+    }
+
+    __syncthreads();
+
+    // Now check if this specific thread has valid output to compute
+    if (x >= output.width() || !validBlock) {
+        return;
+    }
+
+    // Compute horizontal average using shared memory
+    WorkType sum = SetAll<WorkType>(0.0f);
+
+    int tileIdx = threadIdx.x + kernelAnchorX;
+    for (int kx = 0; kx < kernelWidth; ++kx) {
+        sum = sum + StaticCast<WorkType>(tile[tileIdx - kernelAnchorX + kx]);
+    }
+
+    float kernelSize = static_cast<float>(kernelWidth);
+    WorkType average = sum / kernelSize;
+
+    output.at(b, y, x, 0) = SaturateCast<T>(average);
+}
+
+/**
+ * @brief Optimized 1D Vertical Average Blur Kernel with Shared Memory Tiling
+ *
+ * Second pass of separable average blur. Applies vertical averaging using shared
+ * memory to reduce global memory bandwidth. Each block loads a column tile into
+ * shared memory, then computes vertical averages.
+ */
+template <typename T, int BLOCK_HEIGHT, typename SrcWrapper, typename DstWrapper>
+__global__ void avg_blur_vertical(SrcWrapper input, DstWrapper output,
+                                  int kernelHeight, int kernelAnchorY) {
+    using namespace roccv::detail;
+    using WorkType = MakeType<float, NumElements<T>>;
+
+    const int x = blockIdx.x;
+    const int y = blockDim.y * blockIdx.y + threadIdx.y;
+    const int b = blockIdx.z;
+
+    // Check bounds for loading - all threads in block participate
+    const bool validBlock = (x < output.width() && b < output.batches());
+
+    // Shared memory tile: BLOCK_HEIGHT + kernel halo on both sides
+    extern __shared__ char smem[];
+    T* tile = reinterpret_cast<T*>(smem);
+
+    const int halo = kernelHeight - 1;
+    const int tileHeight = BLOCK_HEIGHT + halo;
+
+    // Load data into shared memory with halo
+    // All threads participate in loading to avoid divergence before syncthreads
+    for (int i = threadIdx.y; i < tileHeight; i += blockDim.y) {
+        int srcY = blockIdx.y * BLOCK_HEIGHT + i - kernelAnchorY;
+        if (validBlock) {
+            tile[i] = input.at(b, srcY, x, 0);
+        }
+    }
+
+    __syncthreads();
+
+    // Now check if this specific thread has valid output to compute
+    if (y >= output.height() || !validBlock) {
+        return;
+    }
+
+    // Compute vertical average using shared memory
+    WorkType sum = SetAll<WorkType>(0.0f);
+
+    int tileIdx = threadIdx.y + kernelAnchorY;
+    for (int ky = 0; ky < kernelHeight; ++ky) {
+        sum = sum + StaticCast<WorkType>(tile[tileIdx - kernelAnchorY + ky]);
+    }
+
+    float kernelSize = static_cast<float>(kernelHeight);
+    WorkType average = sum / kernelSize;
+
+    output.at(b, y, x, 0) = SaturateCast<T>(average);
+}
+
+}  // namespace Device
+}  // namespace Kernels
diff --git a/include/kernels/host/avg_blur_host.hpp b/include/kernels/host/avg_blur_host.hpp
new file mode 100644
index 00000000..d2aced8b
--- /dev/null
+++ b/include/kernels/host/avg_blur_host.hpp
@@ -0,0 +1,94 @@
+/**
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "core/detail/casting.hpp"
+#include "core/detail/type_traits.hpp"
+#include "core/detail/vector_utils.hpp"
+#include "operator_types.h"
+
+namespace Kernels {
+namespace Host {
+
+/**
+ * @brief CPU implementation of 2D Average Blur
+ *
+ * Applies a 2D averaging filter to the input image on CPU. Each output pixel is computed
+ * as the average of all pixels within the kernel window.
+ *
+ * @tparam T Data type (e.g., uchar3, float4, etc.)
+ * @tparam SrcWrapper Border wrapper type for input
+ * @tparam DstWrapper Image wrapper type for output
+ *
+ * @param input Input image with border handling
+ * @param output Output image
+ * @param kernelWidth Width of the averaging kernel
+ * @param kernelHeight Height of the averaging kernel
+ * @param kernelAnchorX X-coordinate of kernel anchor point
+ * @param kernelAnchorY Y-coordinate of kernel anchor point
+ */
+template <typename T, typename SrcWrapper, typename DstWrapper>
+void avg_blur_2d(SrcWrapper input, DstWrapper output,
+                 int kernelWidth, int kernelHeight,
+                 int kernelAnchorX, int kernelAnchorY) {
+    using namespace roccv::detail;
+    using WorkType = MakeType<float, NumElements<T>>;
+
+    // Compute kernel area for averaging
+    float kernelArea = static_cast<float>(kernelWidth * kernelHeight);
+
+    // Iterate over all batches
+    for (int b = 0; b < output.batches(); b++) {
+        // Iterate over all output pixels
+        for (int y = 0; y < output.height(); y++) {
+            for (int x = 0; x < output.width(); x++) {
+                // Initialize accumulator
+                WorkType sum = SetAll<WorkType>(0.0f);
+
+                // Compute the sum over the kernel window
+                for (int ky = 0; ky < kernelHeight; ++ky) {
+                    int srcY = y - kernelAnchorY + ky;
+
+                    for (int kx = 0; kx < kernelWidth; ++kx) {
+                        int srcX = x - kernelAnchorX + kx;
+
+                        // Read pixel through border wrapper (handles out-of-bounds)
+                        T pixel = input.at(b, srcY, srcX, 0);
+
+                        // Accumulate as float to avoid overflow
+                        sum = sum + StaticCast<WorkType>(pixel);
+                    }
+                }
+
+                // Compute average by dividing by kernel area
+                WorkType average = sum / kernelArea;
+
+                // Write result with saturation
+                output.at(b, y, x, 0) = SaturateCast<T>(average);
+            }
+        }
+    }
+}
+
+}  // namespace Host
+}  // namespace Kernels
diff --git a/include/op_avg_blur.hpp b/include/op_avg_blur.hpp
new file mode 100644
index 00000000..96bf132a
--- /dev/null
+++ b/include/op_avg_blur.hpp
@@ -0,0 +1,94 @@
+/**
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+#include <i_operator.hpp>
+
+#include "core/tensor.hpp"
+#include "operator_types.h"
+
+namespace roccv {
+/**
+ * @brief Class for managing the Average Blur operator
+ *
+ */
+class AvgBlur final : public IOperator {
+   public:
+    /**
+     * @brief Constructs an AvgBlur object.
+     *
+     */
+    AvgBlur();
+
+    /**
+     * @brief Destroy the AvgBlur object
+     *
+     */
+    ~AvgBlur();
+
+    /**
+     * @brief Construct a new AvgBlur object.
+     * The object can be used to apply an average (mean) blur filter on images in a tensor.
+     *
+     * Limitations:
+     *
+     * Input:
+     *       Supported TensorLayout(s): [NHWC, HWC]
+     *                        Channels: [1, 3, 4]
+     *       Supported DataType(s):     [U8, U16, S16, S32, F32]
+     *
+     * Output:
+     *       Supported TensorLayout(s): [NHWC, HWC]
+     *                        Channels: [1, 3, 4]
+     *       Supported DataType(s):     [U8, U16, S16, S32, F32]
+     *
+     * Input/Output dependency
+     *
+     *       Property      |  Input == Output
+     *      -------------- | -------------
+     *       Data Layout   | Yes
+     *       Data Type     | Yes
+     *       Number        | Yes
+     *       Channels      | Yes
+     *       Width         | Yes
+     *       Height        | Yes
+     *
+     *
+     * @param[in] stream The HIP stream to run this operation on.
+     * @param[in] input Input tensor with image batch data
+     * @param[out] output Output tensor for storing modified image batch data
+     * @param[in] kernelWidth Width of the averaging kernel.
+     * @param[in] kernelHeight Height of the averaging kernel.
+     * @param[in] kernelAnchorX Kernel anchor in X direction.
+     * @param[in] kernelAnchorY Kernel anchor in Y direction.
+     * @param[in] borderMode A border type to identify the pixel extrapolation
+     * method (e.g. BORDER_TYPE_CONSTANT or BORDER_TYPE_REPLICATE)
+     * @param[in] borderValue Set as 0 unless using a constant border.
+     * @param[in] device The device which this operation should run on.
+     * (Default: eDeviceType::GPU)
+     *
+     */
+    void operator()(hipStream_t stream, const Tensor& input, const Tensor& output,
+                    int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY,
+                    eBorderType borderType, float4 borderValue, eDeviceType device = eDeviceType::GPU);
+};
+}  // namespace roccv
\ No newline at end of file
diff --git a/python/include/operators/py_op_avg_blur.hpp b/python/include/operators/py_op_avg_blur.hpp
new file mode 100644
index 00000000..c5bb4d62
--- /dev/null
+++ b/python/include/operators/py_op_avg_blur.hpp
@@ -0,0 +1,45 @@
+/**
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include <operator_types.h>
+#include <pybind11/pybind11.h>
+
+#include "py_stream.hpp"
+#include "py_tensor.hpp"
+
+namespace py = pybind11;
+
+class PyOpAvgBlur {
+   public:
+    static void Export(py::module& m);
+    static PyTensor Execute(PyTensor& input, py::tuple kernelSize, py::tuple anchor,
+                            eBorderType borderMode, py::list borderValue,
+                            std::optional<std::reference_wrapper<PyStream>> stream,
+                            eDeviceType device);
+    static void ExecuteInto(PyTensor& output, PyTensor& input,
+                            py::tuple kernelSize, py::tuple anchor,
+                            eBorderType borderMode, py::list borderValue,
+                            std::optional<std::reference_wrapper<PyStream>> stream,
+                            eDeviceType device);
+};
diff --git a/python/src/main.cpp b/python/src/main.cpp
index ddbec687..31444ab4 100644
--- a/python/src/main.cpp
+++ b/python/src/main.cpp
@@ -23,6 +23,7 @@ THE SOFTWARE.
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
+#include "operators/py_op_avg_blur.hpp"
 #include "operators/py_op_bilateral_filter.hpp"
 #include "operators/py_op_bnd_box.hpp"
 #include "operators/py_op_center_crop.hpp"
@@ -61,6 +62,7 @@ PYBIND11_MODULE(rocpycv, m) {
     PyStructs::Export(m);
     PyStream::Export(m);
     PyTensor::Export(m);
+    PyOpAvgBlur::Export(m);
     PyOpCustomCrop::Export(m);
     PyOpNonMaxSuppression::Export(m);
     PyOpNormalize::Export(m);
diff --git a/python/src/operators/py_op_avg_blur.cpp b/python/src/operators/py_op_avg_blur.cpp
new file mode 100644
index 00000000..46ae9c1c
--- /dev/null
+++ b/python/src/operators/py_op_avg_blur.cpp
@@ -0,0 +1,125 @@
+/**
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "operators/py_op_avg_blur.hpp"
+
+#include <op_avg_blur.hpp>
+
+#include "py_helpers.hpp"
+
+PyTensor PyOpAvgBlur::Execute(PyTensor& input, py::tuple kernelSize, py::tuple anchor,
+                              eBorderType borderMode, py::list borderValue,
+                              std::optional<std::reference_wrapper<PyStream>> stream,
+                              eDeviceType device) {
+
+    hipStream_t hipStream = stream.has_value() ? stream.value().get().getStream() : nullptr;
+
+    // Extract kernel size from tuple
+    int kernelWidth = kernelSize[0].cast<int>();
+    int kernelHeight = kernelSize[1].cast<int>();
+
+    // Extract anchor from tuple
+    int kernelAnchorX = anchor[0].cast<int>();
+    int kernelAnchorY = anchor[1].cast<int>();
+
+    auto inputTensor = input.getTensor();
+    auto outputTensor = std::make_shared<roccv::Tensor>(inputTensor->shape(), inputTensor->dtype(), device);
+
+    roccv::AvgBlur op;
+    op(hipStream, *inputTensor, *outputTensor, kernelWidth, kernelHeight,
+       kernelAnchorX, kernelAnchorY, borderMode, GetFloat4FromPyList(borderValue), device);
+    return PyTensor(outputTensor);
+}
+
+void PyOpAvgBlur::ExecuteInto(PyTensor& output, PyTensor& input,
+                              py::tuple kernelSize, py::tuple anchor,
+                              eBorderType borderMode, py::list borderValue,
+                              std::optional<std::reference_wrapper<PyStream>> stream,
+                              eDeviceType device) {
+
+    hipStream_t hipStream = stream.has_value() ? stream.value().get().getStream() : nullptr;
+
+    // Extract kernel size from tuple
+    int kernelWidth = kernelSize[0].cast<int>();
+    int kernelHeight = kernelSize[1].cast<int>();
+
+    // Extract anchor from tuple
+    int kernelAnchorX = anchor[0].cast<int>();
+    int kernelAnchorY = anchor[1].cast<int>();
+
+    roccv::AvgBlur op;
+    op(hipStream, *input.getTensor(), *output.getTensor(), kernelWidth, kernelHeight,
+       kernelAnchorX, kernelAnchorY, borderMode, GetFloat4FromPyList(borderValue), device);
+}
+
+void PyOpAvgBlur::Export(py::module& m) {
+    using namespace py::literals;
+    m.def("avg_blur", &PyOpAvgBlur::Execute, "src"_a, "kernelSize"_a, "anchor"_a,
+                                              "borderMode"_a, "borderValue"_a,
+                                              "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc(
+
+            Executes the Average Blur operation on the given HIP stream.
+
+            See also:
+                Refer to the rocCV C++ API reference for more information on this operation.
+
+            Args:
+                src (rocpycv.Tensor): Input tensor containing one or more images.
+                kernelSize (Tuple[int, int]): Kernel size as (width, height).
+                anchor (Tuple[int, int]): Kernel anchor position as (x, y).
+                borderMode (rocpycv.eBorderType): The border type to identify the pixel extrapolation method.
+                borderValue (List[float]): The color value to use when a constant border is selected.
+                stream (rocpycv.Stream, optional): HIP stream to run this operation on.
+                device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU.
+
+            Returns:
+                rocpycv.Tensor: The output tensor with blurred images.
+          )pbdoc");
+
+    m.def("avg_blur_into", &PyOpAvgBlur::ExecuteInto, "dst"_a, "src"_a, "kernelSize"_a, "anchor"_a,
+                                                       "borderMode"_a, "borderValue"_a,
+                                                       "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc(
+
+
+            Executes the Average Blur operation on the given HIP stream, writing results into a pre-allocated output tensor.
+
+            This operation applies an average (mean) blur filter on images in a tensor.
+            The filter computes the average of all pixels within a rectangular kernel
+            for each pixel position in the image.
+
+            See also:
+                Refer to the rocCV C++ API reference for more information on this operation.
+
+            Args:
+                dst (rocpycv.Tensor): The output tensor which results are written to.
+                src (rocpycv.Tensor): Input tensor containing one or more images.
+                kernelSize (Tuple[int, int]): Kernel size as (width, height).
+                anchor (Tuple[int, int]): Kernel anchor position as (x, y).
+                borderMode (rocpycv.eBorderType): The border type to identify the pixel extrapolation method.
+                borderValue (List[float]): The color value to use when a constant border is selected (4 elements for RGBA).
+                stream (rocpycv.Stream, optional): HIP stream to run this operation on.
+                device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU.
+
+            Returns:
+                None
+           )pbdoc");
+}
diff --git a/src/op_avg_blur.cpp b/src/op_avg_blur.cpp
new file mode 100644
index 00000000..19276890
--- /dev/null
+++ b/src/op_avg_blur.cpp
@@ -0,0 +1,289 @@
+/**
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "op_avg_blur.hpp"
+
+#include <hip/hip_runtime.h>
+
+#include <functional>
+#include <iostream>
+#include <numeric>
+
+#include "common/array_wrapper.hpp"
+#include "core/wrappers/border_wrapper.hpp"
+#include "common/validation_helpers.hpp"
+#include "core/detail/casting.hpp"
+#include "core/detail/math/math.hpp"
+#include "core/detail/type_traits.hpp"
+#include "kernels/device/avg_blur_device.hpp"
+#include "kernels/host/avg_blur_host.hpp"
+
+namespace roccv {
+AvgBlur::AvgBlur() {}
+
+AvgBlur::~AvgBlur() {}
+
+template <typename T, eBorderType B>
+void dispatch_avg_blur_border_mode(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                   int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY,
+                                   T borderValue, eDeviceType device) {
+    BorderWrapper<T, B> inputWrapper(input, borderValue);
+    ImageWrapper<T> outputWrapper(output);
+
+    if (outputWrapper.channels() > 4 || outputWrapper.channels() < 1) {
+        throw Exception("Invalid channel size: cannot be greater than 4 or less than 1.", eStatusType::OUT_OF_BOUNDS);
+    }
+
+    if (device == eDeviceType::GPU) {
+        dim3 block(32, 8);
+        dim3 grid((outputWrapper.width() + block.x - 1) / block.x,
+                  (outputWrapper.height() + block.y - 1) / block.y,
+                  outputWrapper.batches());
+
+        Kernels::Device::avg_blur_2d<T, BorderWrapper<T, B>, ImageWrapper<T>>
+            <<<grid, block, 0, stream>>>(
+                inputWrapper, outputWrapper,
+                kernelWidth, kernelHeight,
+                kernelAnchorX, kernelAnchorY);
+
+        hipError_t err = hipGetLastError();
+        if (err != hipSuccess) {
+            throw Exception("Average blur kernel launch failed: " + std::string(hipGetErrorString(err)),
+                          eStatusType::INVALID_OPERATION);
+        }
+    } else if (device == eDeviceType::CPU) {
+        Kernels::Host::avg_blur_2d<T, BorderWrapper<T, B>, ImageWrapper<T>>(
+            inputWrapper, outputWrapper,
+            kernelWidth, kernelHeight,
+            kernelAnchorX, kernelAnchorY);
+    }
+}
+
+/**
+ * @brief Optimized separable average blur using two-pass filtering with shared memory tiling
+ *
+ * This function implements average blur as two 1D convolutions (horizontal then vertical)
+ * instead of a single 2D convolution. Benefits:
+ * - Reduces computational complexity from O(k²) to O(2k) per pixel
+ * - Uses shared memory tiling to reduce global memory bandwidth
+ * - Typically 1.5-5.5× faster than direct 2D approach for kernels >= 5×5
+ *
+ * Trade-off: Requires intermediate buffer (same size as output)
+ */
+template <typename T, eBorderType B>
+void dispatch_avg_blur_border_mode_separable(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                             int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY,
+                                             T borderValue, eDeviceType device) {
+    BorderWrapper<T, B> inputWrapper(input, borderValue);
+    ImageWrapper<T> outputWrapper(output);
+
+    if (outputWrapper.channels() > 4 || outputWrapper.channels() < 1) {
+        throw Exception("Invalid channel size: cannot be greater than 4 or less than 1.", eStatusType::OUT_OF_BOUNDS);
+    }
+
+    if (device == eDeviceType::GPU) {
+        // Allocate intermediate buffer for horizontal pass result
+        Tensor intermediate(output.shape(), output.dtype(), device);
+        ImageWrapper<T> intermediateWrapper(intermediate);
+
+        // Constants for tiling
+        constexpr int BLOCK_WIDTH = 128;   // Horizontal: 128 threads/block for good occupancy
+        constexpr int BLOCK_HEIGHT = 128;  // Vertical: testing 128 for maximum occupancy
+
+        // Horizontal pass: input -> intermediate
+        {
+            dim3 block(BLOCK_WIDTH, 1);
+            dim3 grid((outputWrapper.width() + BLOCK_WIDTH - 1) / BLOCK_WIDTH,
+                     outputWrapper.height(),
+                     outputWrapper.batches());
+
+            int halo = kernelWidth - 1;
+            int tileWidth = BLOCK_WIDTH + halo;
+            size_t smemSize = tileWidth * sizeof(T);
+
+            Kernels::Device::avg_blur_horizontal<T, BLOCK_WIDTH, BorderWrapper<T, B>, ImageWrapper<T>>
+                <<<grid, block, smemSize, stream>>>(
+                    inputWrapper, intermediateWrapper,
+                    kernelWidth, kernelAnchorX);
+
+            hipError_t err = hipGetLastError();
+            if (err != hipSuccess) {
+                throw Exception("Horizontal blur kernel launch failed: " + std::string(hipGetErrorString(err)),
+                              eStatusType::INVALID_OPERATION);
+            }
+        }
+
+        // Vertical pass: intermediate -> output
+        // Wrap intermediate buffer with border handling for vertical direction
+        BorderWrapper<T, B> intermediateWrapperWithBorder(intermediate, borderValue);
+        {
+            dim3 block(1, BLOCK_HEIGHT);
+            dim3 grid(outputWrapper.width(),
+                     (outputWrapper.height() + BLOCK_HEIGHT - 1) / BLOCK_HEIGHT,
+                     outputWrapper.batches());
+
+            int halo = kernelHeight - 1;
+            int tileHeight = BLOCK_HEIGHT + halo;
+            size_t smemSize = tileHeight * sizeof(T);
+
+            Kernels::Device::avg_blur_vertical<T, BLOCK_HEIGHT, BorderWrapper<T, B>, ImageWrapper<T>>
+                <<<grid, block, smemSize, stream>>>(
+                    intermediateWrapperWithBorder, outputWrapper,
+                    kernelHeight, kernelAnchorY);
+
+            hipError_t err = hipGetLastError();
+            if (err != hipSuccess) {
+                throw Exception("Vertical blur kernel launch failed: " + std::string(hipGetErrorString(err)),
+                              eStatusType::INVALID_OPERATION);
+            }
+        }
+        
+    } else if (device == eDeviceType::CPU) {
+        Kernels::Host::avg_blur_2d<T, BorderWrapper<T, B>, ImageWrapper<T>>(
+            inputWrapper, outputWrapper,
+            kernelWidth, kernelHeight,
+            kernelAnchorX, kernelAnchorY);
+    }
+}
+
+template <typename T>
+void dispatch_avg_blur_dtype(hipStream_t stream, const Tensor &input, const Tensor &output,
+                             int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY,
+                             eBorderType borderMode, float4 borderValue, eDeviceType device) {
+    // Select kernel dispatcher based on requested border mode.
+    // clang-format off
+    static const std::unordered_map<eBorderType, std::function<void(hipStream_t, const Tensor&, const Tensor&, int, int, int, int, T, eDeviceType)>>
+        funcs = {
+            {eBorderType::BORDER_TYPE_REPLICATE,   dispatch_avg_blur_border_mode<T, eBorderType::BORDER_TYPE_REPLICATE>},
+            {eBorderType::BORDER_TYPE_CONSTANT,    dispatch_avg_blur_border_mode<T, eBorderType::BORDER_TYPE_CONSTANT>},
+            {eBorderType::BORDER_TYPE_REFLECT,     dispatch_avg_blur_border_mode<T, eBorderType::BORDER_TYPE_REFLECT>},
+            {eBorderType::BORDER_TYPE_REFLECT101,  dispatch_avg_blur_border_mode<T, eBorderType::BORDER_TYPE_REFLECT101>},
+            {eBorderType::BORDER_TYPE_WRAP,        dispatch_avg_blur_border_mode<T, eBorderType::BORDER_TYPE_WRAP>}
+        };
+    // clang-format on
+
+    if (!funcs.contains(borderMode)) {
+        throw Exception("AvgBlur does not support the given border mode.", eStatusType::NOT_IMPLEMENTED);
+    }
+
+    auto func = funcs.at(borderMode);
+    func(stream, input, output, kernelWidth, kernelHeight, kernelAnchorX, kernelAnchorY,
+         detail::SaturateCast<T>(borderValue), device);
+}
+
+template <typename T>
+void dispatch_avg_blur_dtype_optimized(hipStream_t stream, const Tensor &input, const Tensor &output,
+                                       int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY,
+                                       eBorderType borderMode, float4 borderValue, eDeviceType device) {
+    // Select kernel dispatcher based on requested border mode.
+    // clang-format off
+    static const std::unordered_map<eBorderType, std::function<void(hipStream_t, const Tensor&, const Tensor&, int, int, int, int, T, eDeviceType)>>
+        funcs = {
+            {eBorderType::BORDER_TYPE_REPLICATE,   dispatch_avg_blur_border_mode_separable<T, eBorderType::BORDER_TYPE_REPLICATE>},
+            {eBorderType::BORDER_TYPE_CONSTANT,    dispatch_avg_blur_border_mode_separable<T, eBorderType::BORDER_TYPE_CONSTANT>},
+            {eBorderType::BORDER_TYPE_REFLECT,     dispatch_avg_blur_border_mode_separable<T, eBorderType::BORDER_TYPE_REFLECT>},
+            {eBorderType::BORDER_TYPE_REFLECT101,  dispatch_avg_blur_border_mode_separable<T, eBorderType::BORDER_TYPE_REFLECT101>},
+            {eBorderType::BORDER_TYPE_WRAP,        dispatch_avg_blur_border_mode_separable<T, eBorderType::BORDER_TYPE_WRAP>}
+        };
+    // clang-format on
+
+    if (!funcs.contains(borderMode)) {
+        throw Exception("AvgBlur does not support the given border mode.", eStatusType::NOT_IMPLEMENTED);
+    }
+
+    auto func = funcs.at(borderMode);
+    func(stream, input, output, kernelWidth, kernelHeight, kernelAnchorX, kernelAnchorY,
+         detail::SaturateCast<T>(borderValue), device);
+}
+
+void AvgBlur::operator()(hipStream_t stream, const Tensor &input, const Tensor &output,
+                         int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY,
+                         eBorderType borderType, float4 borderValue, eDeviceType device) {
+    // Verify that the tensors are located on the right device (CPU or GPU).
+    CHECK_TENSOR_DEVICE(input, device);
+    CHECK_TENSOR_DEVICE(output, device);
+
+    // Ensure all tensors are using supported datatypes
+    CHECK_TENSOR_DATATYPES(input, DATA_TYPE_U8, DATA_TYPE_U16, DATA_TYPE_S16, DATA_TYPE_S32, DATA_TYPE_F32);
+    CHECK_TENSOR_DATATYPES(output, DATA_TYPE_U8, DATA_TYPE_U16, DATA_TYPE_S16, DATA_TYPE_S32, DATA_TYPE_F32);
+
+    // Ensure all tensors are using supported layouts.
+    CHECK_TENSOR_LAYOUT(input, TENSOR_LAYOUT_NHWC, TENSOR_LAYOUT_HWC);
+    CHECK_TENSOR_LAYOUT(output, TENSOR_LAYOUT_NHWC, TENSOR_LAYOUT_HWC);
+
+    CHECK_TENSOR_CHANNELS(input, 1, 3, 4);
+
+    eDataType dtype = input.dtype().etype();
+    int64_t channels = input.shape(input.layout().channels_index());
+
+    // Ensure the layout and shapes for the input/output tensor match
+    CHECK_TENSOR_COMPARISON(input.layout() == output.layout());
+    CHECK_TENSOR_COMPARISON(input.shape() == output.shape());
+
+    // Choose between direct 2D and optimized separable approach based on kernel size
+    // Benchmark results show crossover point between 9×9 and 11×11 kernels
+    // Below 11×11: kernel launch overhead dominates computational savings
+    // At 11×11 and above: separable filtering provides 1.3-3× speedup
+    // For GPU only - CPU uses direct 2D in both cases
+    bool useSeparable = (device == eDeviceType::GPU) &&
+                       (kernelWidth >= 11 || kernelHeight >= 11);
+
+    if (useSeparable) {
+        // Use optimized separable filtering with shared memory tiling
+        // clang-format off
+        static const std::unordered_map<
+            eDataType, std::array<std::function<void(hipStream_t, const Tensor &, const Tensor &, int, int, int, int,
+                                                     eBorderType, float4, eDeviceType)>, 4>>
+            funcs_optimized = {
+                {eDataType::DATA_TYPE_U8, {dispatch_avg_blur_dtype_optimized<uchar1>, 0, dispatch_avg_blur_dtype_optimized<uchar3>, dispatch_avg_blur_dtype_optimized<uchar4>}},
+                {eDataType::DATA_TYPE_U16, {dispatch_avg_blur_dtype_optimized<ushort1>, 0, dispatch_avg_blur_dtype_optimized<ushort3>, dispatch_avg_blur_dtype_optimized<ushort4>}},
+                {eDataType::DATA_TYPE_S16, {dispatch_avg_blur_dtype_optimized<short1>, 0, dispatch_avg_blur_dtype_optimized<short3>, dispatch_avg_blur_dtype_optimized<short4>}},
+                {eDataType::DATA_TYPE_S32, {dispatch_avg_blur_dtype_optimized<int1>, 0, dispatch_avg_blur_dtype_optimized<int3>, dispatch_avg_blur_dtype_optimized<int4>}},
+                {eDataType::DATA_TYPE_F32, {dispatch_avg_blur_dtype_optimized<float1>, 0, dispatch_avg_blur_dtype_optimized<float3>, dispatch_avg_blur_dtype_optimized<float4>}}
+            };
+        // clang-format on
+
+        auto func = funcs_optimized.at(dtype)[channels - 1];
+        if (func == 0) throw Exception("Not mapped to a defined function.", eStatusType::INVALID_OPERATION);
+        func(stream, input, output, kernelWidth, kernelHeight, kernelAnchorX, kernelAnchorY, borderType, borderValue, device);
+    } else {
+        // Use direct 2D convolution (better for small kernels)
+        // clang-format off
+        static const std::unordered_map<
+            eDataType, std::array<std::function<void(hipStream_t, const Tensor &, const Tensor &, int, int, int, int,
+                                                     eBorderType, float4, eDeviceType)>, 4>>
+            funcs = {
+                {eDataType::DATA_TYPE_U8, {dispatch_avg_blur_dtype<uchar1>, 0, dispatch_avg_blur_dtype<uchar3>, dispatch_avg_blur_dtype<uchar4>}},
+                {eDataType::DATA_TYPE_U16, {dispatch_avg_blur_dtype<ushort1>, 0, dispatch_avg_blur_dtype<ushort3>, dispatch_avg_blur_dtype<ushort4>}},
+                {eDataType::DATA_TYPE_S16, {dispatch_avg_blur_dtype<short1>, 0, dispatch_avg_blur_dtype<short3>, dispatch_avg_blur_dtype<short4>}},
+                {eDataType::DATA_TYPE_S32, {dispatch_avg_blur_dtype<int1>, 0, dispatch_avg_blur_dtype<int3>, dispatch_avg_blur_dtype<int4>}},
+                {eDataType::DATA_TYPE_F32, {dispatch_avg_blur_dtype<float1>, 0, dispatch_avg_blur_dtype<float3>, dispatch_avg_blur_dtype<float4>}}
+            };
+        // clang-format on
+
+        auto func = funcs.at(dtype)[channels - 1];
+        if (func == 0) throw Exception("Not mapped to a defined function.", eStatusType::INVALID_OPERATION);
+        func(stream, input, output, kernelWidth, kernelHeight, kernelAnchorX, kernelAnchorY, borderType, borderValue, device);
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////////////////////////////
+}
+}  // namespace roccv
\ No newline at end of file
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp b/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp
new file mode 100644
index 00000000..33ec722e
--- /dev/null
+++ b/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp
@@ -0,0 +1,302 @@
+/**
+Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <core/detail/casting.hpp>
+#include <core/detail/type_traits.hpp>
+#include <core/detail/vector_utils.hpp>
+#include <core/wrappers/border_wrapper.hpp>
+#include <core/wrappers/image_wrapper.hpp>
+#include <op_avg_blur.hpp>
+
+#include "test_helpers.hpp"
+
+using namespace roccv;
+using namespace roccv::detail;
+using namespace roccv::tests;
+
+namespace {
+
+/**
+ * @brief Verified golden C++ model for the average blur operation on one image.
+ *
+ * @tparam T Vectorized datatype of the image's pixels.
+ * @param[in] input Input tensor containing image data.
+ * @param[out] output Output tensor containing blurred image data.
+ * @param[in] kernelWidth Width of the averaging kernel
+ * @param[in] kernelHeight Height of the averaging kernel
+ * @param[in] kernelAnchorX X-coordinate of kernel anchor point
+ * @param[in] kernelAnchorY Y-coordinate of kernel anchor point
+ * @param[in] borderMode Border pixel extrapolation method
+ * @param[in] borderValue Color for constant border mode
+ * @return None.
+ */
+template <typename T, eBorderType borderMode, typename BT = detail::BaseType<T>>
+void GenerateGoldenAvgBlur(std::vector<BT>& input, std::vector<BT>& output, int32_t batchSize, Size2D imageSize,
+                           int kernelWidth, int kernelHeight, int kernelAnchorX, int kernelAnchorY, T borderValue) {
+    BorderWrapper<T, borderMode> src(ImageWrapper<T>(input, batchSize, imageSize.w, imageSize.h), borderValue);
+    ImageWrapper<T> dst(output, batchSize, imageSize.w, imageSize.h);
+    using namespace roccv::detail;
+    using WorkType = MakeType<float, NumElements<T>>;
+
+    // Compute kernel area for averaging
+    float kernelArea = static_cast<float>(kernelWidth * kernelHeight);
+
+    // Iterate over all batches
+    for (int b = 0; b < dst.batches(); b++) {
+        // Iterate over all output pixels
+        for (int j = 0; j < dst.height(); j++) {
+            for (int i = 0; i < dst.width(); i++) {
+                // Initialize accumulator
+                WorkType sum = SetAll<WorkType>(0.0f);
+
+                // Compute the sum over the kernel window
+                for (int ky = 0; ky < kernelHeight; ++ky) {
+                    int srcY = j - kernelAnchorY + ky;
+
+                    for (int kx = 0; kx < kernelWidth; ++kx) {
+                        int srcX = i - kernelAnchorX + kx;
+
+                        T pixel = src.at(b, srcY, srcX, 0);
+
+                        sum = sum + StaticCast<WorkType>(pixel);
+                    }
+                }
+
+                WorkType average = sum / kernelArea;
+
+                dst.at(b, j, i, 0) = SaturateCast<T>(average);
+            }
+        }
+    }
+}
+
+/**
+ * @brief Tests correctness of the average blur operator, comparing it against a generated golden result.
+ *
+ * @tparam T Underlying datatype of the image's pixels.
+ * @tparam BT Base type of the image data.
+ * @param[in] batchSize Number of images in the batch.
+ * @param[in] width Width of each image in the batch.
+ * @param[in] height Height of each image in the batch.
+ * @param[in] format Image format.
+ * @param[in] kernelWidth Width of the averaging kernel
+ * @param[in] kernelHeight Height of the averaging kernel
+ * @param[in] borderColor Color for constant border mode
+ * @param[in] device Device this correctness test should be run on.
+ */
+template <typename T, eBorderType BorderMode, typename BT = detail::BaseType<T>>
+void TestCorrectness(int batchSize, int width, int height, ImageFormat format, int kernelWidth, int kernelHeight,
+                     float4 borderColor, eDeviceType device) {
+    // Create input and output tensor based on test parameters
+    Tensor input(batchSize, {width, height}, format, device);
+    Tensor output(batchSize, {width, height}, format, device);
+
+    // Create a vector and fill it with random data.
+    std::vector<BT> inputData(input.shape().size());
+    FillVector(inputData);
+    if constexpr (std::is_floating_point_v<BT>) {
+        for (int i = 0; i < inputData.size(); i++) {
+            inputData[i] *= static_cast<BT>(std::numeric_limits<ushort>::max());
+        }
+    }
+
+    // Copy generated input data into input tensor
+    CopyVectorIntoTensor(input, inputData);
+
+    // Calculate kernel anchor (center of kernel)
+    int kernelAnchorX = kernelWidth / 2;
+    int kernelAnchorY = kernelHeight / 2;
+
+    hipStream_t stream;
+    HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream));
+    AvgBlur op;
+    op(stream, input, output, kernelWidth, kernelHeight, kernelAnchorX, kernelAnchorY, BorderMode, borderColor,
+       device);
+    HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(stream));
+    HIP_VALIDATE_NO_ERRORS(hipStreamDestroy(stream));
+
+    // Copy data from output tensor into a host allocated vector
+    std::vector<BT> outputData(output.shape().size());
+    CopyTensorIntoVector(outputData, output);
+
+    // Calculate golden reference
+    std::vector<BT> refData(output.shape().size());
+    GenerateGoldenAvgBlur<T, BorderMode>(inputData, refData, batchSize, {width, height}, kernelWidth, kernelHeight,
+                                         kernelAnchorX, kernelAnchorY, detail::SaturateCast<T>(borderColor));
+
+    // Compare data in actual output versus the generated golden reference image
+    CompareVectorsNear(outputData, refData, 1);
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+    TEST_CASES_BEGIN();
+
+    // GPU correctness tests - U8 (1 channel)
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+                                                            eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REPLICATE>(4, 32, 32, FMT_U8, 5, 5, {0.0, 0.0, 0.0, 0.0},
+                                                             eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_U8, 7, 7, {0.0, 0.0, 0.0, 0.0},
+                                                           eDeviceType::GPU)));
+
+    // GPU correctness tests - RGB8 (3 channels)
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+                                                             eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REFLECT>(2, 32, 32, FMT_RGB8, 5, 5, {100.0, 100.0, 0.0, 0.0},
+                                                            eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_WRAP>(1, 16, 16, FMT_RGB8, 3, 3, {50.0, 50.0, 50.0, 0.0},
+                                                         eDeviceType::GPU)));
+
+    // GPU correctness tests - RGBA8 (4 channels)
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_WRAP>(1, 10, 10, FMT_RGBA8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+                                                         eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REPLICATE>(5, 64, 64, FMT_RGBA8, 5, 5, {0.0, 0.0, 0.0, 0.0},
+                                                              eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REFLECT101>(2, 20, 20, FMT_RGBA8, 7, 7, {128.0, 128.0, 128.0, 255.0},
+                                                               eDeviceType::GPU)));
+
+    // GPU correctness tests - S16 (signed 16-bit)
+    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_S16, 3, 3, {500.0, 500.0, 0.0, 0.0},
+                                                             eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_REFLECT>(3, 20, 20, FMT_S16, 5, 5, {500.0, 500.0, 0.0, 0.0},
+                                                            eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<short3, BORDER_TYPE_REPLICATE>(2, 16, 16, FMT_RGBs16, 3, 3, {100.0, 100.0, 100.0, 0.0},
+                                                              eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<short4, BORDER_TYPE_WRAP>(1, 24, 24, FMT_RGBAs16, 5, 5, {0.0, 0.0, 0.0, 0.0},
+                                                         eDeviceType::GPU)));
+
+    // GPU correctness tests - U16 (unsigned 16-bit)
+    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U16, 3, 3, {0.0, 0.0, 0.0, 0.0},
+                                                              eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_REFLECT>(2, 20, 20, FMT_U16, 5, 5, {0.0, 0.0, 0.0, 0.0},
+                                                             eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<ushort3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB16, 3, 3, {500.0, 600.0, 0.0, 0.0},
+                                                              eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<ushort4, BORDER_TYPE_REFLECT>(1, 20, 20, FMT_RGBA16, 5, 5, {500.0, 600.0, 0.0, 0.0},
+                                                             eDeviceType::GPU)));
+
+    // GPU correctness tests - S32 (signed 32-bit)
+    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_CONSTANT>(1, 32, 32, FMT_S32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+                                                           eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_WRAP>(2, 32, 32, FMT_S32, 5, 5, {500.0, 500.0, 0.0, 0.0},
+                                                       eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<int3, BORDER_TYPE_REPLICATE>(1, 16, 16, FMT_RGBs32, 3, 3, {100.0, 100.0, 100.0, 0.0},
+                                                            eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<int4, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_RGBAs32, 5, 5, {0.0, 0.0, 0.0, 0.0},
+                                                          eDeviceType::GPU)));
+
+    // GPU correctness tests - F32 (float)
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_F32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+                                                              eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(2, 24, 24, FMT_F32, 5, 5, {600.0, 500.0, 0.0, 0.0},
+                                                         eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBf32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+                                                              eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBf32, 7, 7, {600.0, 500.0, 0.0, 0.0},
+                                                         eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBAf32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+                                                              eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBAf32, 5, 5, {600.0, 500.0, 0.0, 0.0},
+                                                         eDeviceType::GPU)));
+
+    // CPU correctness tests - U8 (1 channel)
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+                                                            eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REPLICATE>(4, 32, 32, FMT_U8, 5, 5, {0.0, 0.0, 0.0, 0.0},
+                                                             eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_U8, 7, 7, {0.0, 0.0, 0.0, 0.0},
+                                                           eDeviceType::CPU)));
+
+    // CPU correctness tests - RGB8 (3 channels)
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+                                                             eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REFLECT>(2, 32, 32, FMT_RGB8, 5, 5, {100.0, 100.0, 0.0, 0.0},
+                                                            eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_WRAP>(1, 16, 16, FMT_RGB8, 3, 3, {50.0, 50.0, 50.0, 0.0},
+                                                         eDeviceType::CPU)));
+
+    // CPU correctness tests - RGBA8 (4 channels)
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_WRAP>(1, 10, 10, FMT_RGBA8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+                                                         eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REPLICATE>(5, 64, 64, FMT_RGBA8, 5, 5, {0.0, 0.0, 0.0, 0.0},
+                                                              eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REFLECT101>(2, 20, 20, FMT_RGBA8, 7, 7, {128.0, 128.0, 128.0, 255.0},
+                                                               eDeviceType::CPU)));
+
+    // CPU correctness tests - S16 (signed 16-bit)
+    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_S16, 3, 3, {500.0, 500.0, 0.0, 0.0},
+                                                             eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_REFLECT>(3, 20, 20, FMT_S16, 5, 5, {500.0, 500.0, 0.0, 0.0},
+                                                            eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<short3, BORDER_TYPE_REPLICATE>(2, 16, 16, FMT_RGBs16, 3, 3, {100.0, 100.0, 100.0, 0.0},
+                                                              eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<short4, BORDER_TYPE_WRAP>(1, 24, 24, FMT_RGBAs16, 5, 5, {0.0, 0.0, 0.0, 0.0},
+                                                         eDeviceType::CPU)));
+
+    // CPU correctness tests - U16 (unsigned 16-bit)
+    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U16, 3, 3, {0.0, 0.0, 0.0, 0.0},
+                                                              eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_REFLECT>(2, 20, 20, FMT_U16, 5, 5, {0.0, 0.0, 0.0, 0.0},
+                                                             eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<ushort3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB16, 3, 3, {500.0, 600.0, 0.0, 0.0},
+                                                              eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<ushort4, BORDER_TYPE_REFLECT>(1, 20, 20, FMT_RGBA16, 5, 5, {500.0, 600.0, 0.0, 0.0},
+                                                             eDeviceType::CPU)));
+
+    // CPU correctness tests - S32 (signed 32-bit)
+    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_CONSTANT>(1, 32, 32, FMT_S32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+                                                           eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_WRAP>(2, 32, 32, FMT_S32, 5, 5, {500.0, 500.0, 0.0, 0.0},
+                                                       eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<int3, BORDER_TYPE_REPLICATE>(1, 16, 16, FMT_RGBs32, 3, 3, {100.0, 100.0, 100.0, 0.0},
+                                                            eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<int4, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_RGBAs32, 5, 5, {0.0, 0.0, 0.0, 0.0},
+                                                          eDeviceType::CPU)));
+
+    // CPU correctness tests - F32 (float)
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_F32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+                                                              eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(2, 24, 24, FMT_F32, 5, 5, {600.0, 500.0, 0.0, 0.0},
+                                                         eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBf32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+                                                              eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBf32, 7, 7, {600.0, 500.0, 0.0, 0.0},
+                                                         eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBAf32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+                                                              eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBAf32, 5, 5, {600.0, 500.0, 0.0, 0.0},
+                                                         eDeviceType::CPU)));
+
+    // Additional edge cases - various kernel sizes
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REPLICATE>(1, 40, 40, FMT_RGB8, 9, 9, {0.0, 0.0, 0.0, 0.0},
+                                                              eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_CONSTANT>(1, 50, 50, FMT_RGBAf32, 11, 11, {0.0, 0.0, 0.0, 0.0},
+                                                             eDeviceType::GPU)));
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REPLICATE>(1, 40, 40, FMT_RGB8, 9, 9, {0.0, 0.0, 0.0, 0.0},
+                                                              eDeviceType::CPU)));
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_CONSTANT>(1, 50, 50, FMT_RGBAf32, 11, 11, {0.0, 0.0, 0.0, 0.0},
+                                                             eDeviceType::CPU)));
+
+    TEST_CASES_END();
+}
diff --git a/tests/roccv/python/test_op_avg_blur.py b/tests/roccv/python/test_op_avg_blur.py
new file mode 100644
index 00000000..d38ab2fe
--- /dev/null
+++ b/tests/roccv/python/test_op_avg_blur.py
@@ -0,0 +1,56 @@
+# ##############################################################################
+# Copyright (c)  - 2026 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# ##############################################################################
+
+import pytest
+import rocpycv
+
+from test_helpers import compare_tensors, generate_tensor
+
+
+@pytest.mark.parametrize("device", [rocpycv.eDeviceType.GPU, rocpycv.eDeviceType.CPU])
+@pytest.mark.parametrize("dtype", [rocpycv.eDataType.U8, rocpycv.eDataType.U16, rocpycv.eDataType.S16, rocpycv.eDataType.S32, rocpycv.eDataType.F32])
+@pytest.mark.parametrize("border_mode", [rocpycv.eBorderType.CONSTANT])
+@pytest.mark.parametrize("border_val", [[0, 0, 0, 0]])
+@pytest.mark.parametrize("kernel_size,anchor", [
+    ((3, 3), (1, 1)),   
+    ((7, 7), (3, 3)),   
+    ((9, 9), (4, 4))
+])
+@pytest.mark.parametrize("channels", [1, 3, 4])
+@pytest.mark.parametrize("samples,height,width", [
+    (1, 56, 64),
+    (3, 14, 40),
+    (5, 45, 105)
+])
+def test_op_avg_blur(samples, height, width, channels, border_mode, border_val, kernel_size, anchor, dtype, device):
+    input = generate_tensor(samples, width, height, channels, dtype, device)
+    output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device)
+
+    stream = rocpycv.Stream()
+    rocpycv.avg_blur_into(output_golden, input, kernel_size, anchor,
+                          border_mode, border_val, stream, device)
+    output = rocpycv.avg_blur(input, kernel_size, anchor,
+                              border_mode, border_val, stream, device)
+    stream.synchronize()
+
+    compare_tensors(output, output_golden)

From 4ec666c916205af7087bdc931a4d98ae4b02d2ca Mon Sep 17 00:00:00 2001
From: svcbuild <paveltc>
Date: Thu, 9 Apr 2026 14:33:14 -0700
Subject: [PATCH 2/2] Made -1,-1 the default anchor, added anchors to C++ tests

---
 src/op_avg_blur.cpp                           |   8 ++
 .../src/tests/operators/test_op_avg_blur.cpp  | 124 +++++++++---------
 tests/roccv/python/test_op_avg_blur.py        |   2 +-
 3 files changed, 70 insertions(+), 64 deletions(-)

diff --git a/src/op_avg_blur.cpp b/src/op_avg_blur.cpp
index 19276890..27563e97 100644
--- a/src/op_avg_blur.cpp
+++ b/src/op_avg_blur.cpp
@@ -231,6 +231,14 @@ void AvgBlur::operator()(hipStream_t stream, const Tensor &input, const Tensor &
 
     CHECK_TENSOR_CHANNELS(input, 1, 3, 4);
 
+    // Handle default anchor (-1, -1) by setting to kernel center
+    if (kernelAnchorX == -1) {
+        kernelAnchorX = kernelWidth / 2;
+    }
+    if (kernelAnchorY == -1) {
+        kernelAnchorY = kernelHeight / 2;
+    }
+
     eDataType dtype = input.dtype().etype();
     int64_t channels = input.shape(input.layout().channels_index());
 
diff --git a/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp b/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp
index 33ec722e..38f566d4 100644
--- a/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp
+++ b/tests/roccv/cpp/src/tests/operators/test_op_avg_blur.cpp
@@ -100,12 +100,14 @@ void GenerateGoldenAvgBlur(std::vector<BT>& input, std::vector<BT>& output, int3
  * @param[in] format Image format.
  * @param[in] kernelWidth Width of the averaging kernel
  * @param[in] kernelHeight Height of the averaging kernel
+ * @param[in] kernelAnchorX X-coordinate of kernel anchor point
+ * @param[in] kernelAnchorY Y-coordinate of kernel anchor point
  * @param[in] borderColor Color for constant border mode
  * @param[in] device Device this correctness test should be run on.
  */
 template <typename T, eBorderType BorderMode, typename BT = detail::BaseType<T>>
 void TestCorrectness(int batchSize, int width, int height, ImageFormat format, int kernelWidth, int kernelHeight,
-                     float4 borderColor, eDeviceType device) {
+                     int kernelAnchorX, int kernelAnchorY, float4 borderColor, eDeviceType device) {
     // Create input and output tensor based on test parameters
     Tensor input(batchSize, {width, height}, format, device);
     Tensor output(batchSize, {width, height}, format, device);
@@ -122,10 +124,6 @@ void TestCorrectness(int batchSize, int width, int height, ImageFormat format, i
     // Copy generated input data into input tensor
     CopyVectorIntoTensor(input, inputData);
 
-    // Calculate kernel anchor (center of kernel)
-    int kernelAnchorX = kernelWidth / 2;
-    int kernelAnchorY = kernelHeight / 2;
-
     hipStream_t stream;
     HIP_VALIDATE_NO_ERRORS(hipStreamCreate(&stream));
     AvgBlur op;
@@ -153,149 +151,149 @@ int main(int argc, char** argv) {
     TEST_CASES_BEGIN();
 
     // GPU correctness tests - U8 (1 channel)
-    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0},
                                                             eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REPLICATE>(4, 32, 32, FMT_U8, 5, 5, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REPLICATE>(4, 32, 32, FMT_U8, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_U8, 7, 7, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_U8, 7, 7, 3, 3, {0.0, 0.0, 0.0, 0.0},
                                                            eDeviceType::GPU)));
 
     // GPU correctness tests - RGB8 (3 channels)
-    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REFLECT>(2, 32, 32, FMT_RGB8, 5, 5, {100.0, 100.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REFLECT>(2, 32, 32, FMT_RGB8, 5, 5, 2, 2, {100.0, 100.0, 0.0, 0.0},
                                                             eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_WRAP>(1, 16, 16, FMT_RGB8, 3, 3, {50.0, 50.0, 50.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_WRAP>(1, 16, 16, FMT_RGB8, 3, 3, 1, 1, {50.0, 50.0, 50.0, 0.0},
                                                          eDeviceType::GPU)));
 
     // GPU correctness tests - RGBA8 (4 channels)
-    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_WRAP>(1, 10, 10, FMT_RGBA8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_WRAP>(1, 10, 10, FMT_RGBA8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0},
                                                          eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REPLICATE>(5, 64, 64, FMT_RGBA8, 5, 5, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REPLICATE>(5, 64, 64, FMT_RGBA8, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0},
                                                               eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REFLECT101>(2, 20, 20, FMT_RGBA8, 7, 7, {128.0, 128.0, 128.0, 255.0},
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REFLECT101>(2, 20, 20, FMT_RGBA8, 7, 7, 3, 3, {128.0, 128.0, 128.0, 255.0},
                                                                eDeviceType::GPU)));
 
     // GPU correctness tests - S16 (signed 16-bit)
-    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_S16, 3, 3, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_S16, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0},
                                                              eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_REFLECT>(3, 20, 20, FMT_S16, 5, 5, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_REFLECT>(3, 20, 20, FMT_S16, 5, 5, 2, 2, {500.0, 500.0, 0.0, 0.0},
                                                             eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<short3, BORDER_TYPE_REPLICATE>(2, 16, 16, FMT_RGBs16, 3, 3, {100.0, 100.0, 100.0, 0.0},
+    TEST_CASE((TestCorrectness<short3, BORDER_TYPE_REPLICATE>(2, 16, 16, FMT_RGBs16, 3, 3, 1, 1, {100.0, 100.0, 100.0, 0.0},
                                                               eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<short4, BORDER_TYPE_WRAP>(1, 24, 24, FMT_RGBAs16, 5, 5, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<short4, BORDER_TYPE_WRAP>(1, 24, 24, FMT_RGBAs16, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0},
                                                          eDeviceType::GPU)));
 
     // GPU correctness tests - U16 (unsigned 16-bit)
-    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U16, 3, 3, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U16, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0},
                                                               eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_REFLECT>(2, 20, 20, FMT_U16, 5, 5, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_REFLECT>(2, 20, 20, FMT_U16, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<ushort3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB16, 3, 3, {500.0, 600.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<ushort3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB16, 3, 3, 1, 1, {500.0, 600.0, 0.0, 0.0},
                                                               eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<ushort4, BORDER_TYPE_REFLECT>(1, 20, 20, FMT_RGBA16, 5, 5, {500.0, 600.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<ushort4, BORDER_TYPE_REFLECT>(1, 20, 20, FMT_RGBA16, 5, 5, 2, 2, {500.0, 600.0, 0.0, 0.0},
                                                              eDeviceType::GPU)));
 
     // GPU correctness tests - S32 (signed 32-bit)
-    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_CONSTANT>(1, 32, 32, FMT_S32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_CONSTANT>(1, 32, 32, FMT_S32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0},
                                                            eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_WRAP>(2, 32, 32, FMT_S32, 5, 5, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_WRAP>(2, 32, 32, FMT_S32, 5, 5, 2, 2, {500.0, 500.0, 0.0, 0.0},
                                                        eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<int3, BORDER_TYPE_REPLICATE>(1, 16, 16, FMT_RGBs32, 3, 3, {100.0, 100.0, 100.0, 0.0},
+    TEST_CASE((TestCorrectness<int3, BORDER_TYPE_REPLICATE>(1, 16, 16, FMT_RGBs32, 3, 3, 1, 1, {100.0, 100.0, 100.0, 0.0},
                                                             eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<int4, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_RGBAs32, 5, 5, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<int4, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_RGBAs32, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0},
                                                           eDeviceType::GPU)));
 
     // GPU correctness tests - F32 (float)
-    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_F32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_F32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0},
                                                               eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(2, 24, 24, FMT_F32, 5, 5, {600.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(2, 24, 24, FMT_F32, 5, 5, 2, 2, {600.0, 500.0, 0.0, 0.0},
                                                          eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBf32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBf32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0},
                                                               eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBf32, 7, 7, {600.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBf32, 7, 7, 3, 3, {600.0, 500.0, 0.0, 0.0},
                                                          eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBAf32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBAf32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0},
                                                               eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBAf32, 5, 5, {600.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBAf32, 5, 5, 2, 2, {600.0, 500.0, 0.0, 0.0},
                                                          eDeviceType::GPU)));
 
     // CPU correctness tests - U8 (1 channel)
-    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0},
                                                             eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REPLICATE>(4, 32, 32, FMT_U8, 5, 5, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REPLICATE>(4, 32, 32, FMT_U8, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_U8, 7, 7, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_U8, 7, 7, 3, 3, {0.0, 0.0, 0.0, 0.0},
                                                            eDeviceType::CPU)));
 
     // CPU correctness tests - RGB8 (3 channels)
-    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REFLECT>(2, 32, 32, FMT_RGB8, 5, 5, {100.0, 100.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REFLECT>(2, 32, 32, FMT_RGB8, 5, 5, 2, 2, {100.0, 100.0, 0.0, 0.0},
                                                             eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_WRAP>(1, 16, 16, FMT_RGB8, 3, 3, {50.0, 50.0, 50.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_WRAP>(1, 16, 16, FMT_RGB8, 3, 3, 1, 1, {50.0, 50.0, 50.0, 0.0},
                                                          eDeviceType::CPU)));
 
     // CPU correctness tests - RGBA8 (4 channels)
-    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_WRAP>(1, 10, 10, FMT_RGBA8, 3, 3, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_WRAP>(1, 10, 10, FMT_RGBA8, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0},
                                                          eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REPLICATE>(5, 64, 64, FMT_RGBA8, 5, 5, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REPLICATE>(5, 64, 64, FMT_RGBA8, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0},
                                                               eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REFLECT101>(2, 20, 20, FMT_RGBA8, 7, 7, {128.0, 128.0, 128.0, 255.0},
+    TEST_CASE((TestCorrectness<uchar4, BORDER_TYPE_REFLECT101>(2, 20, 20, FMT_RGBA8, 7, 7, 3, 3, {128.0, 128.0, 128.0, 255.0},
                                                                eDeviceType::CPU)));
 
     // CPU correctness tests - S16 (signed 16-bit)
-    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_S16, 3, 3, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_S16, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0},
                                                              eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_REFLECT>(3, 20, 20, FMT_S16, 5, 5, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<short1, BORDER_TYPE_REFLECT>(3, 20, 20, FMT_S16, 5, 5, 2, 2, {500.0, 500.0, 0.0, 0.0},
                                                             eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<short3, BORDER_TYPE_REPLICATE>(2, 16, 16, FMT_RGBs16, 3, 3, {100.0, 100.0, 100.0, 0.0},
+    TEST_CASE((TestCorrectness<short3, BORDER_TYPE_REPLICATE>(2, 16, 16, FMT_RGBs16, 3, 3, 1, 1, {100.0, 100.0, 100.0, 0.0},
                                                               eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<short4, BORDER_TYPE_WRAP>(1, 24, 24, FMT_RGBAs16, 5, 5, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<short4, BORDER_TYPE_WRAP>(1, 24, 24, FMT_RGBAs16, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0},
                                                          eDeviceType::CPU)));
 
     // CPU correctness tests - U16 (unsigned 16-bit)
-    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U16, 3, 3, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_U16, 3, 3, 1, 1, {0.0, 0.0, 0.0, 0.0},
                                                               eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_REFLECT>(2, 20, 20, FMT_U16, 5, 5, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<ushort1, BORDER_TYPE_REFLECT>(2, 20, 20, FMT_U16, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<ushort3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB16, 3, 3, {500.0, 600.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<ushort3, BORDER_TYPE_CONSTANT>(1, 20, 20, FMT_RGB16, 3, 3, 1, 1, {500.0, 600.0, 0.0, 0.0},
                                                               eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<ushort4, BORDER_TYPE_REFLECT>(1, 20, 20, FMT_RGBA16, 5, 5, {500.0, 600.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<ushort4, BORDER_TYPE_REFLECT>(1, 20, 20, FMT_RGBA16, 5, 5, 2, 2, {500.0, 600.0, 0.0, 0.0},
                                                              eDeviceType::CPU)));
 
     // CPU correctness tests - S32 (signed 32-bit)
-    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_CONSTANT>(1, 32, 32, FMT_S32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_CONSTANT>(1, 32, 32, FMT_S32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0},
                                                            eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_WRAP>(2, 32, 32, FMT_S32, 5, 5, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<int1, BORDER_TYPE_WRAP>(2, 32, 32, FMT_S32, 5, 5, 2, 2, {500.0, 500.0, 0.0, 0.0},
                                                        eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<int3, BORDER_TYPE_REPLICATE>(1, 16, 16, FMT_RGBs32, 3, 3, {100.0, 100.0, 100.0, 0.0},
+    TEST_CASE((TestCorrectness<int3, BORDER_TYPE_REPLICATE>(1, 16, 16, FMT_RGBs32, 3, 3, 1, 1, {100.0, 100.0, 100.0, 0.0},
                                                             eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<int4, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_RGBAs32, 5, 5, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<int4, BORDER_TYPE_REFLECT>(2, 24, 24, FMT_RGBAs32, 5, 5, 2, 2, {0.0, 0.0, 0.0, 0.0},
                                                           eDeviceType::CPU)));
 
     // CPU correctness tests - F32 (float)
-    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_F32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_F32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0},
                                                               eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(2, 24, 24, FMT_F32, 5, 5, {600.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float1, BORDER_TYPE_WRAP>(2, 24, 24, FMT_F32, 5, 5, 2, 2, {600.0, 500.0, 0.0, 0.0},
                                                          eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBf32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBf32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0},
                                                               eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBf32, 7, 7, {600.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float3, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBf32, 7, 7, 3, 3, {600.0, 500.0, 0.0, 0.0},
                                                          eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBAf32, 3, 3, {500.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_REPLICATE>(1, 24, 24, FMT_RGBAf32, 3, 3, 1, 1, {500.0, 500.0, 0.0, 0.0},
                                                               eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBAf32, 5, 5, {600.0, 500.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_WRAP>(2, 24, 24, FMT_RGBAf32, 5, 5, 2, 2, {600.0, 500.0, 0.0, 0.0},
                                                          eDeviceType::CPU)));
 
     // Additional edge cases - various kernel sizes
-    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REPLICATE>(1, 40, 40, FMT_RGB8, 9, 9, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REPLICATE>(1, 40, 40, FMT_RGB8, 9, 9, 4, 4, {0.0, 0.0, 0.0, 0.0},
                                                               eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_CONSTANT>(1, 50, 50, FMT_RGBAf32, 11, 11, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_CONSTANT>(1, 50, 50, FMT_RGBAf32, 11, 11, 5, 5, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::GPU)));
-    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REPLICATE>(1, 40, 40, FMT_RGB8, 9, 9, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<uchar3, BORDER_TYPE_REPLICATE>(1, 40, 40, FMT_RGB8, 9, 9, 4, 4, {0.0, 0.0, 0.0, 0.0},
                                                               eDeviceType::CPU)));
-    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_CONSTANT>(1, 50, 50, FMT_RGBAf32, 11, 11, {0.0, 0.0, 0.0, 0.0},
+    TEST_CASE((TestCorrectness<float4, BORDER_TYPE_CONSTANT>(1, 50, 50, FMT_RGBAf32, 11, 11, 5, 5, {0.0, 0.0, 0.0, 0.0},
                                                              eDeviceType::CPU)));
 
     TEST_CASES_END();
diff --git a/tests/roccv/python/test_op_avg_blur.py b/tests/roccv/python/test_op_avg_blur.py
index d38ab2fe..877c175d 100644
--- a/tests/roccv/python/test_op_avg_blur.py
+++ b/tests/roccv/python/test_op_avg_blur.py
@@ -33,7 +33,7 @@
 @pytest.mark.parametrize("border_val", [[0, 0, 0, 0]])
 @pytest.mark.parametrize("kernel_size,anchor", [
     ((3, 3), (1, 1)),   
-    ((7, 7), (3, 3)),   
+    ((7, 7), (-1, -1)),   
     ((9, 9), (4, 4))
 ])
 @pytest.mark.parametrize("channels", [1, 3, 4])