From fc788bc4cb1f81b473c5f0a5311a0d1a37138703 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 28 Apr 2026 18:36:00 -0400 Subject: [PATCH 01/20] Expose tensor pointer in rocpycv --- python/include/py_tensor.hpp | 16 ++++++++++++++++ python/src/py_tensor.cpp | 10 ++++++++++ 2 files changed, 26 insertions(+) diff --git a/python/include/py_tensor.hpp b/python/include/py_tensor.hpp index 523a5f27..f0c3e1d2 100644 --- a/python/include/py_tensor.hpp +++ b/python/include/py_tensor.hpp @@ -138,6 +138,22 @@ class PyTensor : public std::enable_shared_from_this { */ eDeviceType getDevice(); + /** + * @brief Returns the address of the tensor's underlying data buffer as an + * unsigned integer. For GPU tensors this is a HIP device address; for CPU + * tensors it is a host address. Use ``device()`` to disambiguate. + * + * The pointer is non-owning. The caller is responsible for ensuring this + * PyTensor remains alive for as long as the pointer is used; otherwise the + * underlying buffer may be freed and the pointer left dangling. + * + * Intended for zero-copy interop with frameworks that accept a raw + * pointer + shape + dtype (e.g. ``migraphx.argument_from_pointer``). + * + * @return uintptr_t + */ + uintptr_t getDataPtr(); + /** * @brief Gets the underlying roccv::Tensor that this tensor container wraps. 
* diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 913a25a4..8c864b46 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -178,6 +178,11 @@ eTensorLayout PyTensor::getLayout() { return m_tensor->layout().elayout(); } eDeviceType PyTensor::getDevice() { return m_tensor->device(); } +uintptr_t PyTensor::getDataPtr() { + auto tensorData = m_tensor->exportData(); + return reinterpret_cast(tensorData.basePtr()); +} + std::shared_ptr PyTensor::getTensor() { return m_tensor; } py::tuple PyTensor::getDLDevice() { @@ -207,6 +212,11 @@ void PyTensor::Export(pybind11::module& m) { .def("shape", &PyTensor::getShape, "Returns a list representing the tensor shape.") .def("layout", &PyTensor::getLayout, "Returns the layout for this tensor.") .def("device", &PyTensor::getDevice, "Returns the device this tensor is on.") + .def("data_ptr", &PyTensor::getDataPtr, + "Returns the address of the tensor's underlying buffer as an integer. " + "For GPU tensors this is a HIP device address; for CPU tensors a host address. " + "The pointer is non-owning -- keep the tensor alive for as long as the pointer is used. 
" + "Intended for zero-copy interop with frameworks like MIGraphX.") .def("ndim", &PyTensor::getRank, "Returns the number of dimensions of the tensor.") .def("dtype", &PyTensor::getDataType, "Returns the data type of the tensor.") .def("__dlpack_device__", &PyTensor::getDLDevice, From 134eb7ceed2c7240d633c02f47ed4c01621a546e Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 28 Apr 2026 18:40:44 -0400 Subject: [PATCH 02/20] Expose stream pointer in rocpycv --- python/include/py_stream.hpp | 12 ++++++++++++ python/src/py_stream.cpp | 9 ++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/python/include/py_stream.hpp b/python/include/py_stream.hpp index 202420c9..a5923359 100644 --- a/python/include/py_stream.hpp +++ b/python/include/py_stream.hpp @@ -55,6 +55,18 @@ class PyStream { */ void synchronize(); + /** + * @brief Returns the wrapped HIP stream handle as an unsigned integer. + * + * Intended for zero-copy interop with frameworks that accept a raw HIP + * stream handle (e.g. ``migraphx.run_async`` with stream type + * ``"ihipStream_t"``). The handle is non-owning -- keep this PyStream alive + * for as long as the handle is in use. + * + * @return uintptr_t + */ + uintptr_t getHandle(); + /** * @brief Exports the PyStream object to the specified python module. 
* diff --git a/python/src/py_stream.cpp b/python/src/py_stream.cpp index ac6ae73f..c830bc8e 100644 --- a/python/src/py_stream.cpp +++ b/python/src/py_stream.cpp @@ -38,8 +38,15 @@ PyStream::~PyStream() { void PyStream::synchronize() { HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(m_stream)); } +uintptr_t PyStream::getHandle() { return reinterpret_cast(m_stream); } + void PyStream::Export(py::module& m) { py::class_(m, "Stream", "Python wrapper for HIP streams.") .def(py::init<>(), "Creates a HIP stream.") - .def("synchronize", &PyStream::synchronize, "Blocks until all worked queued on this stream is finished."); + .def("synchronize", &PyStream::synchronize, "Blocks until all worked queued on this stream is finished.") + .def("handle", &PyStream::getHandle, + "Returns the underlying HIP stream handle (hipStream_t) as an integer. " + "Intended for zero-copy interop with frameworks that accept a raw stream handle, " + "e.g. migraphx.run_async(..., stream_handle, \"ihipStream_t\"). " + "The handle is non-owning -- keep the Stream alive while the handle is in use."); } \ No newline at end of file From 228f422133b1fa5833cc66a4ace8cef7ff977cb2 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 10:31:57 -0400 Subject: [PATCH 03/20] Update stubs --- python/src/rocpycv.pyi | 98 ++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 46 deletions(-) diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index 855a8d5a..c6bc639c 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -153,6 +153,10 @@ class Stream: """ Creates a HIP stream. """ + def handle(self) -> int: + """ + Returns the underlying HIP stream handle (hipStream_t) as an integer. Intended for zero-copy interop with frameworks that accept a raw stream handle, e.g. migraphx.run_async(..., stream_handle, "ihipStream_t"). The handle is non-owning -- keep the Stream alive while the handle is in use. 
+ """ def synchronize(self) -> None: """ Blocks until all worked queued on this stream is finished. @@ -166,7 +170,7 @@ class Tensor: """ Returns a tuple containing the DLPack device and device id for the tensor. """ - def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: eTensorLayout, dtype: eDataType, device: eDeviceType = ...) -> None: + def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: eTensorLayout, dtype: eDataType, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Constructs a tensor object. """ @@ -174,6 +178,10 @@ class Tensor: """ Returns a deep copy of the tensor with data copied to a specified device type. """ + def data_ptr(self) -> int: + """ + Returns the address of the tensor's underlying buffer as an integer. For GPU tensors this is a HIP device address; for CPU tensors a host address. The pointer is non-owning -- keep the tensor alive for as long as the pointer is used. Intended for zero-copy interop with frameworks like MIGraphX. + """ def device(self) -> eDeviceType: """ Returns the device this tensor is on. @@ -472,8 +480,6 @@ class eDataType: F32 F64 - - 4S16 """ F32: typing.ClassVar[eDataType] # value = F64: typing.ClassVar[eDataType] # value = @@ -483,7 +489,7 @@ class eDataType: U16: typing.ClassVar[eDataType] # value = U32: typing.ClassVar[eDataType] # value = U8: typing.ClassVar[eDataType] # value = - __members__: typing.ClassVar[dict[str, eDataType]] # value = {'U8': , 'S8': , 'U16': , 'S16': , 'U32': , 'S32': , 'F32': , 'F64': , '4S16': } + __members__: typing.ClassVar[dict[str, eDataType]] # value = {'U8': , 'S8': , 'U16': , 'S16': , 'U32': , 'S32': , 'F32': , 'F64': } def __eq__(self, other: typing.Any) -> bool: ... def __getstate__(self) -> int: @@ -728,7 +734,7 @@ class eThresholdType: @property def value(self) -> int: ... 
-def advcvtcolor(src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def advcvtcolor(src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Advanced Color Convert operation on the given HIP stream. @@ -745,7 +751,7 @@ def advcvtcolor(src: Tensor, conversion_code: eColorConversionCode, color_spec: Returns: rocpycv.Tensor: The output tensor. """ -def advcvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def advcvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Advanced Color Convert operation on the given HIP stream. @@ -763,7 +769,7 @@ def advcvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversion Returns: None """ -def bilateral_filter(src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def bilateral_filter(src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Bilateral Filter operation on the given HIP stream. 
@@ -783,7 +789,7 @@ def bilateral_filter(src: Tensor, diameter: typing.SupportsInt | typing.Supports Returns: rocpycv.Tensor: The output tensor. """ -def bilateral_filter_into(dst: Tensor, src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def bilateral_filter_into(dst: Tensor, src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Bilateral Filter operation on the given HIP stream. @@ -804,7 +810,7 @@ def bilateral_filter_into(dst: Tensor, src: Tensor, diameter: typing.SupportsInt Returns: None """ -def bndbox(src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def bndbox(src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the BndBox operation on the given HIP stream. @@ -820,7 +826,7 @@ def bndbox(src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = Non Returns: rocpycv.Tensor: The output tensor. """ -def bndbox_into(dst: Tensor, src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def bndbox_into(dst: Tensor, src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the BndBox operation on the given HIP stream. 
@@ -837,7 +843,7 @@ def bndbox_into(dst: Tensor, src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.S Returns: None """ -def center_crop(src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def center_crop(src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Center Crop operation on the given HIP stream. @@ -854,7 +860,7 @@ def center_crop(src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = N Returns: rocpycv.Tensor: The output tensor. """ -def center_crop_into(dst: Tensor, src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def center_crop_into(dst: Tensor, src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Center Crop operation on the given HIP stream. @@ -871,7 +877,7 @@ def center_crop_into(dst: Tensor, src: Tensor, crop_size: tuple, stream: rocpycv Returns: None """ -def composite(foreground: Tensor, background: Tensor, fgmask: Tensor, outchannels: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def composite(foreground: Tensor, background: Tensor, fgmask: Tensor, outchannels: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Composite operation on the given HIP stream. @@ -889,7 +895,7 @@ def composite(foreground: Tensor, background: Tensor, fgmask: Tensor, outchannel Returns: rocpycv.Tensor: The output tensor with number of channels. """ -def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> None: +def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Composite operation on the given HIP stream. @@ -907,7 +913,7 @@ def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Returns: None """ -def convert_to(src: Tensor, dtype: eDataType, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def convert_to(src: Tensor, dtype: eDataType, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Convert To operation on the given HIP stream. @@ -925,7 +931,7 @@ def convert_to(src: Tensor, dtype: eDataType, alpha: typing.SupportsFloat | typi Returns: rocpycv.Tensor: The output tensor. """ -def convert_to_into(dst: Tensor, src: Tensor, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def convert_to_into(dst: Tensor, src: Tensor, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Convert To operation on the given HIP stream. 
@@ -943,7 +949,7 @@ def convert_to_into(dst: Tensor, src: Tensor, alpha: typing.SupportsFloat | typi Returns: None """ -def copymakeborder(src: Tensor, border_mode: eBorderType = ..., border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, bottom: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, right: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def copymakeborder(src: Tensor, border_mode: eBorderType = eBorderType.eBorderType.CONSTANT, border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, bottom: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, right: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the CopyMakeBorder operation on the given HIP stream. @@ -964,7 +970,7 @@ def copymakeborder(src: Tensor, border_mode: eBorderType = ..., border_value: li Returns: rocpycv.Tensor: The output tensor. """ -def copymakeborder_into(dst: Tensor, src: Tensor, border_mode: eBorderType = ..., border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def copymakeborder_into(dst: Tensor, src: Tensor, border_mode: eBorderType = eBorderType.eBorderType.CONSTANT, border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the CopyMakeBorder operation on the given HIP stream. @@ -984,7 +990,7 @@ def copymakeborder_into(dst: Tensor, src: Tensor, border_mode: eBorderType = ... 
Returns: None """ -def custom_crop(src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def custom_crop(src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Custom Crop operation on the given HIP stream. @@ -1001,7 +1007,7 @@ def custom_crop(src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = Non Returns: None """ -def custom_crop_into(dst: Tensor, src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def custom_crop_into(dst: Tensor, src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Custom Crop operation on the given HIP stream. @@ -1017,7 +1023,7 @@ def custom_crop_into(dst: Tensor, src: Tensor, crop_rect: Box, stream: rocpycv.S Returns: rocpycv.Tensor: The output tensor. """ -def cvtcolor(src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def cvtcolor(src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Color Convert operation on the given HIP stream. @@ -1033,7 +1039,7 @@ def cvtcolor(src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv Returns: rocpycv.Tensor: The output tensor. """ -def cvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def cvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Color Convert operation on the given HIP stream. 
@@ -1050,7 +1056,7 @@ def cvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCod Returns: None """ -def flip(src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def flip(src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Flip operation on the given HIP stream. @@ -1066,7 +1072,7 @@ def flip(src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stre Returns: rocpycv.Tensor: The output tensor. """ -def flip_into(dst: Tensor, src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def flip_into(dst: Tensor, src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Flip operation on the given HIP stream. @@ -1087,7 +1093,7 @@ def from_dlpack(buffer: typing.Any, layout: eTensorLayout) -> Tensor: """ Wraps a DLPack supported tensor in a rocpycv tensor. """ -def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Gamma Contrast operation on the given HIP stream. @@ -1103,7 +1109,7 @@ def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsInd Returns: rocpycv.Tensor: The output tensor. """ -def gamma_contrast_into(dst: Tensor, src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> None: +def gamma_contrast_into(dst: Tensor, src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Gamma Contrast operation on the given HIP stream. @@ -1120,7 +1126,7 @@ def gamma_contrast_into(dst: Tensor, src: Tensor, gamma: typing.SupportsFloat | Returns: None """ -def histogram(src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def histogram(src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Histogram operation on the given HIP stream. @@ -1136,7 +1142,7 @@ def histogram(src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | Returns: rocpycv.Tensor: Output tensor with width of 256 and a height equal to the batch size of input (1 if HWC input). """ -def histogram_into(dst: Tensor, src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def histogram_into(dst: Tensor, src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Histogram operation on the given HIP stream. @@ -1153,7 +1159,7 @@ def histogram_into(dst: Tensor, src: Tensor, mask: rocpycv.Tensor | None, stream Returns: None """ -def nms(src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> Tensor: +def nms(src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Non-maximum Suppression operation on the given HIP stream. @@ -1171,7 +1177,7 @@ def nms(src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typ Returns: rocpycv.Tensor: The output tensor of shape [i, j], containing 1 (kept) or 0 (suppressed) for each bounding box (j) per batch (i). Results will be written to this tensor. """ -def nms_into(dst: Tensor, src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def nms_into(dst: Tensor, src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Non-maximum Suppression operation on the given HIP stream. @@ -1190,7 +1196,7 @@ def nms_into(dst: Tensor, src: Tensor, scores: Tensor, score_threshold: typing.S Returns: None """ -def normalize(src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> Tensor: +def normalize(src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Normalize operation on the given HIP stream. @@ -1211,7 +1217,7 @@ def normalize(src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsIn Returns: rocpycv.Tensor: The output tensor. """ -def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Normalize operation on the given HIP stream. @@ -1233,7 +1239,7 @@ def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: Returns: None """ -def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> Tensor: +def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Reformat operation and returns the result as a new tensor. @@ -1249,7 +1255,7 @@ def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | Returns: rocpycv.Tensor: The reformatted tensor. """ -def reformat_into(output: Tensor, input: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def reformat_into(output: Tensor, input: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Reformat operation on the given HIP stream. @@ -1265,7 +1271,7 @@ def reformat_into(output: Tensor, input: Tensor, stream: rocpycv.Stream | None = Returns: None """ -def remap(src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def remap(src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Remap operation on the given HIP stream. @@ -1287,7 +1293,7 @@ def remap(src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_in Returns: rocpycv.Tensor: The output tensor. """ -def remap_into(dst: Tensor, src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> None: +def remap_into(dst: Tensor, src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Remap operation on the given HIP stream. @@ -1310,7 +1316,7 @@ def remap_into(dst: Tensor, src: Tensor, map: Tensor, in_interpolation: eInterpo Returns: None """ -def resize(src: Tensor, shape: tuple, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def resize(src: Tensor, shape: tuple, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Resize operation on the given HIP stream. @@ -1327,7 +1333,7 @@ def resize(src: Tensor, shape: tuple, interp: eInterpolationType, stream: rocpyc Returns: rocpycv.Tensor: The output tensor. """ -def resize_into(dst: Tensor, src: Tensor, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def resize_into(dst: Tensor, src: Tensor, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Resize operation on the given HIP stream. @@ -1344,7 +1350,7 @@ def resize_into(dst: Tensor, src: Tensor, interp: eInterpolationType, stream: ro Returns: None """ -def rotate(src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> Tensor: +def rotate(src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Rotate operation on the given HIP stream. @@ -1362,7 +1368,7 @@ def rotate(src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, Returns: rocpycv.Tensor: The output tensor. """ -def rotate_into(dst: Tensor, src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def rotate_into(dst: Tensor, src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Rotate operation on the given HIP stream. @@ -1381,7 +1387,7 @@ def rotate_into(dst: Tensor, src: Tensor, angle_deg: typing.SupportsFloat | typi Returns: None """ -def threshold(src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def threshold(src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Thresholding operation on the given HIP stream. @@ -1397,7 +1403,7 @@ def threshold(src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing. stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. 
""" -def threshold_into(dst: Tensor, src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def threshold_into(dst: Tensor, src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Thresholding operation on the given HIP stream. @@ -1414,7 +1420,7 @@ def threshold_into(dst: Tensor, src: Tensor, thresh: Tensor, maxVal: Tensor, max stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. """ -def warp_affine(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def warp_affine(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Warp Affine operation on the given HIP stream. @@ -1434,7 +1440,7 @@ def warp_affine(src: Tensor, xform: list, inverted: bool, interp: eInterpolation Returns: rocpycv.Tensor: The output tensor. """ -def warp_affine_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> None: +def warp_affine_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Warp Affine operation on the given HIP stream. @@ -1455,7 +1461,7 @@ def warp_affine_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, inte Returns: None """ -def warp_perspective(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def warp_perspective(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Warp Perspective operation on the given HIP stream. @@ -1475,7 +1481,7 @@ def warp_perspective(src: Tensor, xform: list, inverted: bool, interp: eInterpol Returns: rocpycv.Tensor: The output tensor. """ -def warp_perspective_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def warp_perspective_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Warp Perspective operation on the given HIP stream. 
From 12c85cf5f46e5ef067d2601c5e345cded3d20819 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 11:14:08 -0400 Subject: [PATCH 04/20] Add migraphx classification example --- .../python/migraphx-classification/README.md | 52 ++++++ .../migraphx_classification.py | 173 ++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 samples/python/migraphx-classification/README.md create mode 100644 samples/python/migraphx-classification/migraphx_classification.py diff --git a/samples/python/migraphx-classification/README.md b/samples/python/migraphx-classification/README.md new file mode 100644 index 00000000..0de2d90d --- /dev/null +++ b/samples/python/migraphx-classification/README.md @@ -0,0 +1,52 @@ +# rocCV MIGraphX Classification Sample + +This sample demonstrates how to use rocCV to preprocess an image on the GPU and run inference with a ResNet50 ONNX model through MIGraphX. The preprocessed tensor is handed off to MIGraphX via a raw GPU pointer for zero-copy interop, so no host round-trip is needed between preprocessing and inference. + +## Dependencies + +- A rocCV build with the Python bindings (`rocpycv`) on `PYTHONPATH`. Build rocCV with Python 3.11 by passing the following to cmake: + ```shell + -DPYTHON_VERSION_SUGGESTED=3.11 + ``` +- [MIGraphX](https://github.com/ROCm/AMDMIGraphX) with its Python bindings. +- `opencv-python` and `numpy`. +- A ResNet50 ONNX model with input name `data` and shape `[N, 3, 224, 224]` (e.g. the ONNX Model Zoo `resnet50-v1-7.onnx`). +- Optional: a newline-separated ImageNet class labels file for human-readable output. + +## Command line + +```shell +python3.11 migraphx_classification.py \ + --input path/to/image.jpg \ + --model path/to/resnet50.onnx \ + --labels path/to/imagenet_classes.txt \ + --top-k 5 +``` + +Arguments: +- `--input` (required): path to the input image. +- `--model` (required): path to the ResNet50 ONNX file. +- `--labels` (optional): path to an ImageNet class label file. 
If omitted, classes are reported by index. +- `--top-k` (optional, default 5): number of top predictions to print. + +On the first run, the script compiles the ONNX model for the GPU and caches the result alongside the ONNX file as `_b1.mxr`. Subsequent runs load the cached `.mxr` directly and skip compilation. + +## Preprocessing Operators + +The preprocessing pipeline runs entirely on the GPU through `rocpycv`: + +1. **CvtColor**: Converts the OpenCV BGR image to RGB. +2. **Resize**: Resizes to 224x224 using cubic interpolation. +3. **Convert To**: Casts U8 pixels to float32 (no scaling — the `/255` step is folded into the normalize parameters). +4. **Normalize**: Applies ImageNet mean/std normalization. The mean and std constants are pre-multiplied by 255 so the operator can normalize directly from the [0, 255] float pixel range in a single pass. +5. **Reformat**: Converts the tensor from NHWC to NCHW, the layout MIGraphX/ONNX expects. + +## MIGraphX Interop + +The compiled MIGraphX program is built with `offload_copy=False`, so input and output buffers must already live on the GPU. The sample binds: +- The rocCV preprocessed tensor's GPU pointer (`tensor.data_ptr()`) as the `data` input via `migraphx.argument_from_pointer`. +- A `migraphx.allocate_gpu` buffer as the output. + +Inference is launched with `model.run_async` using the same HIP stream as the preprocessing pipeline (`stream.handle()`), so preprocessing and inference are serialized on a single stream with no extra synchronization until the final `stream.synchronize()`. + +The output logits are copied back to the host with `migraphx.from_gpu`, passed through softmax, and the top-K classes are printed. 
diff --git a/samples/python/migraphx-classification/migraphx_classification.py b/samples/python/migraphx-classification/migraphx_classification.py new file mode 100644 index 00000000..451df9e3 --- /dev/null +++ b/samples/python/migraphx-classification/migraphx_classification.py @@ -0,0 +1,173 @@ +# ############################################################################## +# Copyright (c) - 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +# ############################################################################## + +"""Classification with rocCV preprocessing and MIGraphX inference.""" + +from __future__ import annotations + +import argparse +import os + +import cv2 +import migraphx +import numpy as np +import rocpycv + + +# ImageNet normalization, scaled to the [0, 255] pixel range so we can apply +# them directly to U8-derived float pixels without a separate /255 step: +# (pixel/255 - mean) / std == (pixel - mean*255) / (std*255) +IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32) * 255.0 +IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32) * 255.0 + +INPUT_H, INPUT_W = 224, 224 + + +def read_image(image_path: str) -> np.ndarray: + """Read an image from disk as an NHWC uint8 BGR numpy array.""" + bgr = cv2.imread(image_path) + if bgr is None: + raise FileNotFoundError(f"Unable to load image: {image_path}") + return np.stack([bgr]) + + +def load_or_compile_model(onnx_path: str) -> migraphx.program: + """Load a cached compiled model, or parse + compile + cache the ONNX file.""" + # TODO: Support other batch sizes later + batch_size = 1 + cache_path = f"{os.path.splitext(onnx_path)[0]}_b{batch_size}.mxr" + + if os.path.exists(cache_path): + print(f"Loading cached compiled model: {cache_path}") + return migraphx.load(cache_path, format="msgpack") + + print(f"Parsing ONNX: {onnx_path}") + model = migraphx.parse_onnx( + onnx_path, + map_input_dims={"data": [batch_size, 3, INPUT_H, INPUT_W]}, + ) + + print("Compiling for GPU...") + # offload_copy=False allows us to bind GPU buffers directly to allow for + # zero-copy interop. 
+ model.compile(migraphx.get_target("gpu"), offload_copy=False) + + print(f"Caching compiled model to: {cache_path}") + migraphx.save(model, cache_path, format="msgpack") + return model + + +def load_labels(labels_path: str | None) -> list[str] | None: + if labels_path is None: + return None + with open(labels_path) as f: + return [line.strip() for line in f if line.strip()] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Classification with rocCV preprocessing and MIGraphX inference" + ) + parser.add_argument("--input", required=True, help="Path to input image") + parser.add_argument( + "--model", required=True, help="Path to a ResNet50 ONNX model" + ) + parser.add_argument( + "--labels", + default=None, + help="Optional path to a newline-separated ImageNet class labels file", + ) + parser.add_argument("--top-k", type=int, default=5) + return parser.parse_args() + + +def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: + x = x - np.max(x, axis=axis, keepdims=True) + e = np.exp(x) + return e / np.sum(e, axis=axis, keepdims=True) + + +def main() -> None: + args = parse_args() + + # 1. Load the model + model = load_or_compile_model(args.model) + + print(f"Reading image: {args.input}") + np_image = read_image(args.input) + print(f"Input image shape: {np_image.shape}") + + print("Preprocessing with rocCV...") + stream = rocpycv.Stream() + + # 2. Convert the image to a rocCV tensor in NHWC layout. + tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) + + # 3. Convert from BGR to RGB for MIGraphX. + tensor = rocpycv.cvtcolor(tensor, rocpycv.COLOR_BGR2RGB, stream, rocpycv.GPU) + + # 4. Resize to 224x224. + tensor = rocpycv.resize(tensor, (1, INPUT_H, INPUT_W, 3), rocpycv.CUBIC, stream, rocpycv.GPU) + + # 5. Cast U8 -> F32 (no scaling; normalize step folds in /255). + tensor = rocpycv.convert_to(tensor, rocpycv.eDataType.F32, 1.0, 0.0, stream, rocpycv.GPU) + + # 6. 
ImageNet normalize: (pixel - mean) / std. + mean_t = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) + std_t = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) + tensor = rocpycv.normalize(tensor, mean_t, std_t, rocpycv.NormalizeFlags.SCALE_IS_STDDEV, 1.0, 0.0, 0.0, stream, rocpycv.GPU) + + # 7. NHWC -> NCHW (MIGraphX / ONNX expects NCHW). + tensor = rocpycv.reformat(tensor, rocpycv.eTensorLayout.NCHW, stream, rocpycv.GPU) + print(f"Preprocessed tensor shape (NCHW): {tensor.shape()}") + + print("Running MIGraphX inference...") + + # Setup MIGraphX arguments/shapes + in_shape = migraphx.shape(type="float_type", lens=tensor.shape()) + out_shape = migraphx.shape(type="float_type", lens=[1, 1000]) + in_arg = migraphx.argument_from_pointer(in_shape, tensor.data_ptr()) + out_buf = migraphx.allocate_gpu(out_shape) + + outputs = model.run_async( + {"data": in_arg, "main:#output_0": out_buf}, + stream.handle(), + "ihipStream_t", + ) + stream.synchronize() + + logits = np.array(migraphx.from_gpu(outputs[0])) + probs = softmax(logits, axis=1) + + labels = load_labels(args.labels) + + # Report top-K for the first image in the batch. + print(f"\nTop {args.top_k} predictions:") + top = np.argsort(probs[0])[::-1][: args.top_k] + for rank, idx in enumerate(top, start=1): + name = labels[idx] if labels is not None and idx < len(labels) else f"class {idx}" + print(f" {rank}. 
{name}: {probs[0][idx]:.6f}") + + +if __name__ == "__main__": + main() From 8fb6c4f6ce575d6494f00792bd37c797fb29d36c Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 11:14:23 -0400 Subject: [PATCH 05/20] Remove 4S16 from pyenums --- python/src/py_enums.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/python/src/py_enums.cpp b/python/src/py_enums.cpp index 0ce39d08..6059c280 100644 --- a/python/src/py_enums.cpp +++ b/python/src/py_enums.cpp @@ -48,7 +48,6 @@ void PyEnums::Export(py::module& m) { .value("S32", DATA_TYPE_S32) .value("F32", DATA_TYPE_F32) .value("F64", DATA_TYPE_F64) - .value("4S16", DATA_TYPE_4S16) .export_values(); py::enum_(m, "eDeviceType") From e024eefcf86c1b01fb10d8bb03e49f7cc3bece5c Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 12:10:06 -0400 Subject: [PATCH 06/20] Move classification sample location --- samples/{python => }/migraphx-classification/README.md | 0 .../migraphx-classification/migraphx_classification.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename samples/{python => }/migraphx-classification/README.md (100%) rename samples/{python => }/migraphx-classification/migraphx_classification.py (100%) diff --git a/samples/python/migraphx-classification/README.md b/samples/migraphx-classification/README.md similarity index 100% rename from samples/python/migraphx-classification/README.md rename to samples/migraphx-classification/README.md diff --git a/samples/python/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py similarity index 100% rename from samples/python/migraphx-classification/migraphx_classification.py rename to samples/migraphx-classification/migraphx_classification.py From 25e2b9dcb16be5e6805764a6dc68373f7b9aacd9 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 12:11:06 -0400 Subject: [PATCH 07/20] Move pytorch classification sample location --- samples/{classification => 
pytorch-classification}/README.md | 0 .../pytorch_classification.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename samples/{classification => pytorch-classification}/README.md (100%) rename samples/{classification => pytorch-classification}/pytorch_classification.py (100%) diff --git a/samples/classification/README.md b/samples/pytorch-classification/README.md similarity index 100% rename from samples/classification/README.md rename to samples/pytorch-classification/README.md diff --git a/samples/classification/pytorch_classification.py b/samples/pytorch-classification/pytorch_classification.py similarity index 100% rename from samples/classification/pytorch_classification.py rename to samples/pytorch-classification/pytorch_classification.py From c8da0a2e1e0bf2a00ad32c9914f500379904c952 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 17:31:30 -0400 Subject: [PATCH 08/20] Cleanup migraphx classification sample --- samples/migraphx-classification/README.md | 4 +- .../migraphx_classification.py | 62 +++++++++++-------- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/samples/migraphx-classification/README.md b/samples/migraphx-classification/README.md index 0de2d90d..ca1dfdc1 100644 --- a/samples/migraphx-classification/README.md +++ b/samples/migraphx-classification/README.md @@ -35,8 +35,8 @@ On the first run, the script compiles the ONNX model for the GPU and caches the The preprocessing pipeline runs entirely on the GPU through `rocpycv`: -1. **CvtColor**: Converts the OpenCV BGR image to RGB. -2. **Resize**: Resizes to 224x224 using cubic interpolation. +1. **Resize**: Resizes to 224x224 using cubic interpolation. +2. **CvtColor**: Converts the OpenCV BGR image to RGB. 3. **Convert To**: Casts U8 pixels to float32 (no scaling — the `/255` step is folded into the normalize parameters). 4. **Normalize**: Applies ImageNet mean/std normalization. 
The mean and std constants are pre-multiplied by 255 so the operator can normalize directly from the [0, 255] float pixel range in a single pass. 5. **Reformat**: Converts the tensor from NHWC to NCHW, the layout MIGraphX/ONNX expects. diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py index 451df9e3..17d7a81e 100644 --- a/samples/migraphx-classification/migraphx_classification.py +++ b/samples/migraphx-classification/migraphx_classification.py @@ -41,6 +41,7 @@ IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32) * 255.0 INPUT_H, INPUT_W = 224, 224 +BATCH_SIZE = 1 def read_image(image_path: str) -> np.ndarray: @@ -51,11 +52,12 @@ def read_image(image_path: str) -> np.ndarray: return np.stack([bgr]) -def load_or_compile_model(onnx_path: str) -> migraphx.program: +def load_or_compile_model(onnx_path: str, use_fp16: bool = True) -> migraphx.program: """Load a cached compiled model, or parse + compile + cache the ONNX file.""" # TODO: Support other batch sizes later batch_size = 1 - cache_path = f"{os.path.splitext(onnx_path)[0]}_b{batch_size}.mxr" + precision_tag = "fp16" if use_fp16 else "fp32" + cache_path = f"{os.path.splitext(onnx_path)[0]}_b{batch_size}_{precision_tag}.mxr" if os.path.exists(cache_path): print(f"Loading cached compiled model: {cache_path}") @@ -67,6 +69,12 @@ def load_or_compile_model(onnx_path: str) -> migraphx.program: map_input_dims={"data": [batch_size, 3, INPUT_H, INPUT_W]}, ) + if use_fp16: + print("Quantizing to FP16...") + # Inserts internal float -> half conversions; model inputs/outputs stay + # float32, so the existing F32 buffer setup remains unchanged. + migraphx.quantize_fp16(model) + print("Compiling for GPU...") # offload_copy=False allows us to bind GPU buffers directly to allow for # zero-copy interop. @@ -110,45 +118,44 @@ def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: def main() -> None: args = parse_args() - # 1. 
Load the model model = load_or_compile_model(args.model) print(f"Reading image: {args.input}") np_image = read_image(args.input) print(f"Input image shape: {np_image.shape}") - print("Preprocessing with rocCV...") - stream = rocpycv.Stream() + # Load/allocate tensors on the GPU + input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) + resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) + rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) + f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) + normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) + nchw : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), rocpycv.NCHW, rocpycv.F32) - # 2. Convert the image to a rocCV tensor in NHWC layout. - tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) + mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) + std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) - # 3. Convert from BGR to RGB for MIGraphX. - tensor = rocpycv.cvtcolor(tensor, rocpycv.COLOR_BGR2RGB, stream, rocpycv.GPU) - - # 4. Resize to 224x224. - tensor = rocpycv.resize(tensor, (1, INPUT_H, INPUT_W, 3), rocpycv.CUBIC, stream, rocpycv.GPU) + # Setup MIGraphX arguments/shapes + in_shape : migraphx.shape = migraphx.shape(type="float_type", lens=nchw.shape()) + out_shape : migraphx.shape = migraphx.shape(type="float_type", lens=[BATCH_SIZE, 1000]) - # 5. Cast U8 -> F32 (no scaling; normalize step folds in /255). 
- tensor = rocpycv.convert_to(tensor, rocpycv.eDataType.F32, 1.0, 0.0, stream, rocpycv.GPU) + in_arg : migraphx.argument = migraphx.argument_from_pointer(in_shape, nchw.data_ptr()) + out_buf : migraphx.buffer = migraphx.allocate_gpu(out_shape) - # 6. ImageNet normalize: (pixel - mean) / std. - mean_t = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) - std_t = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) - tensor = rocpycv.normalize(tensor, mean_t, std_t, rocpycv.NormalizeFlags.SCALE_IS_STDDEV, 1.0, 0.0, 0.0, stream, rocpycv.GPU) + # Begin preprocessing + print("Preprocessing with rocCV...") + stream = rocpycv.Stream() - # 7. NHWC -> NCHW (MIGraphX / ONNX expects NCHW). - tensor = rocpycv.reformat(tensor, rocpycv.eTensorLayout.NCHW, stream, rocpycv.GPU) - print(f"Preprocessed tensor shape (NCHW): {tensor.shape()}") + rocpycv.resize_into(resized, input_tensor, rocpycv.CUBIC, stream) + rocpycv.cvtcolor_into(rgb, resized, rocpycv.COLOR_BGR2RGB, stream) + rocpycv.convert_to_into(f32, rgb, 1.0, 0.0, stream) + rocpycv.normalize_into(normalized, f32, mean_t, std_t, rocpycv.NormalizeFlags.SCALE_IS_STDDEV, 1.0, 0.0, 0.0, stream) + rocpycv.reformat_into(nchw, normalized, stream) + + print(f"Preprocessed tensor shape (NCHW): {nchw.shape()}") print("Running MIGraphX inference...") - # Setup MIGraphX arguments/shapes - in_shape = migraphx.shape(type="float_type", lens=tensor.shape()) - out_shape = migraphx.shape(type="float_type", lens=[1, 1000]) - in_arg = migraphx.argument_from_pointer(in_shape, tensor.data_ptr()) - out_buf = migraphx.allocate_gpu(out_shape) - outputs = model.run_async( {"data": in_arg, "main:#output_0": out_buf}, stream.handle(), @@ -156,6 +163,7 @@ def main() -> None: ) stream.synchronize() + # Postprocess the inference results logits = np.array(migraphx.from_gpu(outputs[0])) probs = softmax(logits, axis=1) From 666226fa62af41ea7983fb847d11e7f0529e5535 Mon Sep 17 00:00:00 
2001 From: Zach Vincze Date: Wed, 29 Apr 2026 17:35:24 -0400 Subject: [PATCH 09/20] Minor cleanup --- samples/migraphx-classification/README.md | 7 ++----- .../migraphx-classification/migraphx_classification.py | 8 +++----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/samples/migraphx-classification/README.md b/samples/migraphx-classification/README.md index ca1dfdc1..c2edcfdd 100644 --- a/samples/migraphx-classification/README.md +++ b/samples/migraphx-classification/README.md @@ -4,10 +4,7 @@ This sample demonstrates how to use rocCV to preprocess an image on the GPU and ## Dependencies -- A rocCV build with the Python bindings (`rocpycv`) on `PYTHONPATH`. Build rocCV with Python 3.11 by passing the following to cmake: - ```shell - -DPYTHON_VERSION_SUGGESTED=3.11 - ``` +- A rocCV build with the Python bindings (`rocpycv`) on `PYTHONPATH`. - [MIGraphX](https://github.com/ROCm/AMDMIGraphX) with its Python bindings. - `opencv-python` and `numpy`. - A ResNet50 ONNX model with input name `data` and shape `[N, 3, 224, 224]` (e.g. the ONNX Model Zoo `resnet50-v1-7.onnx`). 
@@ -16,7 +13,7 @@ This sample demonstrates how to use rocCV to preprocess an image on the GPU and ## Command line ```shell -python3.11 migraphx_classification.py \ +python3 migraphx_classification.py \ --input path/to/image.jpg \ --model path/to/resnet50.onnx \ --labels path/to/imagenet_classes.txt \ diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py index 17d7a81e..c144e534 100644 --- a/samples/migraphx-classification/migraphx_classification.py +++ b/samples/migraphx-classification/migraphx_classification.py @@ -54,10 +54,8 @@ def read_image(image_path: str) -> np.ndarray: def load_or_compile_model(onnx_path: str, use_fp16: bool = True) -> migraphx.program: """Load a cached compiled model, or parse + compile + cache the ONNX file.""" - # TODO: Support other batch sizes later - batch_size = 1 precision_tag = "fp16" if use_fp16 else "fp32" - cache_path = f"{os.path.splitext(onnx_path)[0]}_b{batch_size}_{precision_tag}.mxr" + cache_path = f"{os.path.splitext(onnx_path)[0]}_b{BATCH_SIZE}_{precision_tag}.mxr" if os.path.exists(cache_path): print(f"Loading cached compiled model: {cache_path}") @@ -66,7 +64,7 @@ def load_or_compile_model(onnx_path: str, use_fp16: bool = True) -> migraphx.pro print(f"Parsing ONNX: {onnx_path}") model = migraphx.parse_onnx( onnx_path, - map_input_dims={"data": [batch_size, 3, INPUT_H, INPUT_W]}, + map_input_dims={"data": [BATCH_SIZE, 3, INPUT_H, INPUT_W]}, ) if use_fp16: @@ -144,7 +142,7 @@ def main() -> None: # Begin preprocessing print("Preprocessing with rocCV...") - stream = rocpycv.Stream() + stream : rocpycv.Stream = rocpycv.Stream() rocpycv.resize_into(resized, input_tensor, rocpycv.CUBIC, stream) rocpycv.cvtcolor_into(rgb, resized, rocpycv.COLOR_BGR2RGB, stream) From d0ee8993170a3c7ab352bcafe99cc0e1fe6ee2af Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 17:39:44 -0400 Subject: [PATCH 10/20] Minor variable name changes --- 
samples/migraphx-classification/README.md | 2 +- .../migraphx_classification.py | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/samples/migraphx-classification/README.md b/samples/migraphx-classification/README.md index c2edcfdd..83c7f4e2 100644 --- a/samples/migraphx-classification/README.md +++ b/samples/migraphx-classification/README.md @@ -41,7 +41,7 @@ The preprocessing pipeline runs entirely on the GPU through `rocpycv`: ## MIGraphX Interop The compiled MIGraphX program is built with `offload_copy=False`, so input and output buffers must already live on the GPU. The sample binds: -- The rocCV preprocessed tensor's GPU pointer (`tensor.data_ptr()`) as the `data` input via `migraphx.argument_from_pointer`. +- The rocCV preprocessed tensor's GPU pointer (`preprocessed.data_ptr()`) as the `data` input via `migraphx.argument_from_pointer`. - A `migraphx.allocate_gpu` buffer as the output. Inference is launched with `model.run_async` using the same HIP stream as the preprocessing pipeline (`stream.handle()`), so preprocessing and inference are serialized on a single stream with no extra synchronization until the final `stream.synchronize()`. 
diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py index c144e534..e36b0c85 100644 --- a/samples/migraphx-classification/migraphx_classification.py +++ b/samples/migraphx-classification/migraphx_classification.py @@ -123,21 +123,21 @@ def main() -> None: print(f"Input image shape: {np_image.shape}") # Load/allocate tensors on the GPU - input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) - resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) - rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) - f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) - normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) - nchw : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), rocpycv.NCHW, rocpycv.F32) + input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) + resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) + rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) + f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) + normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) + preprocessed : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), rocpycv.NCHW, rocpycv.F32) - mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) - std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) + mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), 
rocpycv.NHWC).copy_to(rocpycv.GPU) + std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) # Setup MIGraphX arguments/shapes - in_shape : migraphx.shape = migraphx.shape(type="float_type", lens=nchw.shape()) + in_shape : migraphx.shape = migraphx.shape(type="float_type", lens=preprocessed.shape()) out_shape : migraphx.shape = migraphx.shape(type="float_type", lens=[BATCH_SIZE, 1000]) - in_arg : migraphx.argument = migraphx.argument_from_pointer(in_shape, nchw.data_ptr()) + in_arg : migraphx.argument = migraphx.argument_from_pointer(in_shape, preprocessed.data_ptr()) out_buf : migraphx.buffer = migraphx.allocate_gpu(out_shape) # Begin preprocessing @@ -148,9 +148,9 @@ def main() -> None: rocpycv.cvtcolor_into(rgb, resized, rocpycv.COLOR_BGR2RGB, stream) rocpycv.convert_to_into(f32, rgb, 1.0, 0.0, stream) rocpycv.normalize_into(normalized, f32, mean_t, std_t, rocpycv.NormalizeFlags.SCALE_IS_STDDEV, 1.0, 0.0, 0.0, stream) - rocpycv.reformat_into(nchw, normalized, stream) + rocpycv.reformat_into(preprocessed, normalized, stream) - print(f"Preprocessed tensor shape (NCHW): {nchw.shape()}") + print(f"Preprocessed tensor shape (NCHW): {preprocessed.shape()}") print("Running MIGraphX inference...") From 8ad82ba014b8a7f65410eb7190bde9e0854bba08 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 18:15:51 -0400 Subject: [PATCH 11/20] Allow use of numpy types to specify rocpycv.Tensor types --- python/include/py_helpers.hpp | 21 +++++++- python/src/py_helpers.cpp | 49 +++++++++++++++++++ python/src/py_tensor.cpp | 29 ++++++++--- python/src/rocpycv.pyi | 12 ++--- .../migraphx_classification.py | 18 +++---- 5 files changed, 107 insertions(+), 22 deletions(-) diff --git a/python/include/py_helpers.hpp b/python/include/py_helpers.hpp index 369cac10..2e3dc117 100644 --- a/python/include/py_helpers.hpp +++ b/python/include/py_helpers.hpp @@ -86,4 +86,23 @@ extern double2 GetDouble2FromTuple(py::tuple src); * 
@param src A python tuple of size 2. * @return int2 */ -extern int2 GetInt2FromTuple(py::tuple src); \ No newline at end of file +extern int2 GetInt2FromTuple(py::tuple src); + +/** + * @brief Resolves a Python object to an eTensorLayout. Accepts either an rocpycv.eTensorLayout enum + * value, or a layout string such as "NHWC". Throws std::runtime_error for unsupported inputs. + * + * @param obj A Python object describing the tensor layout. + * @return eTensorLayout + */ +extern eTensorLayout LayoutFromPyObject(py::object obj); + +/** + * @brief Resolves a Python object to an eDataType. Accepts either an rocpycv.eDataType enum value, + * or anything coercible to a NumPy dtype (e.g. ``np.float32``, ``np.dtype("uint8")``, + * ``"float32"``). Throws std::runtime_error for unsupported inputs. + * + * @param obj A Python object describing the tensor data type. + * @return eDataType + */ +extern eDataType DataTypeFromPyObject(py::object obj); \ No newline at end of file diff --git a/python/src/py_helpers.cpp b/python/src/py_helpers.cpp index 97902915..8d4dce55 100644 --- a/python/src/py_helpers.cpp +++ b/python/src/py_helpers.cpp @@ -22,7 +22,11 @@ THE SOFTWARE. #include "py_helpers.hpp" +#include +#include + #include +#include eDataType DLTypeToRoccvType(DLDataType dtype) { if (dtype.bits == 8) { @@ -147,4 +147,49 @@ int2 GetInt2FromTuple(py::tuple src) { std::runtime_error("Cannot convert py::tuple to int2. py::tuple.size() != 2."); } return make_int2(src[0].cast<int>(), src[1].cast<int>()); +} + +eTensorLayout LayoutFromPyObject(py::object obj) { + if (py::isinstance<eTensorLayout>(obj)) { + return obj.cast<eTensorLayout>(); + } + + if (py::isinstance<py::str>(obj)) { + std::string s = obj.cast<std::string>(); + for (const auto& [layout, name] : roccv::TensorLayout::layoutStringTable) { + if (name == s) return layout; + } + throw std::runtime_error("Unknown tensor layout string: '" + s + "'."); + } + + throw std::runtime_error("layout must be an rocpycv.eTensorLayout or a layout string (e.g. 
'NHWC')."); +} + +eDataType DataTypeFromPyObject(py::object obj) { + if (py::isinstance(obj)) { + return obj.cast(); + } + + // np.dtype() accepts numpy scalar types (np.float32), dtype instances, and dtype strings, + // so we delegate the parsing to NumPy itself rather than enumerating cases here. + py::dtype dt; + try { + static const py::object np_dtype = py::module_::import("numpy").attr("dtype"); + dt = np_dtype(obj).cast(); + } catch (const std::exception&) { + throw std::runtime_error( + "dtype must be an rocpycv.eDataType or a NumPy dtype/scalar type (e.g. np.float32)."); + } + + DLDataTypeCode code; + switch (dt.kind()) { + case 'u': code = kDLUInt; break; + case 'i': code = kDLInt; break; + case 'f': code = kDLFloat; break; + default: + throw std::runtime_error("Unsupported NumPy dtype for rocpycv.Tensor (kind '" + + std::string(1, dt.kind()) + "')."); + } + DLDataType dl{static_cast(code), static_cast(dt.itemsize() * 8), 1}; + return DLTypeToRoccvType(dl); } \ No newline at end of file diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 8c864b46..4fa4dd1b 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -202,8 +202,15 @@ void PyTensor::Export(pybind11::module& m) { pybind11::class_> tensor(m, "Tensor"); tensor - .def(pybind11::init, eTensorLayout, eDataType, eDeviceType>(), "shape"_a, "layout"_a, - "dtype"_a, "device"_a = eDeviceType::GPU, "Constructs a tensor object.") + .def(pybind11::init([](std::vector shape, py::object layout, py::object dtype, eDeviceType device) { + return std::make_shared(shape, LayoutFromPyObject(layout), DataTypeFromPyObject(dtype), + device); + }), + "shape"_a, "layout"_a, "dtype"_a, "device"_a = eDeviceType::GPU, + "Constructs a tensor object. ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. " + "``rocpycv.NHWC``) or a layout string (``\"NHWC\"``). ``dtype`` may be an " + "``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type " + "(e.g. 
``np.float32``).") .def("copy_to", &PyTensor::copyTo, "device"_a, "Returns a deep copy of the tensor with data copied to a specified device type.") .def("__dlpack__", &PyTensor::toDLPack, "stream"_a = py::none(), @@ -221,8 +228,18 @@ void PyTensor::Export(pybind11::module& m) { .def("dtype", &PyTensor::getDataType, "Returns the data type of the tensor.") .def("__dlpack_device__", &PyTensor::getDLDevice, "Returns a tuple containing the DLPack device and device id for the tensor.") - .def("reshape", &PyTensor::reshape, "new_shape"_a, "layout"_a, - "Creates a new tensor with the specified shape."); - m.def("from_dlpack", &PyTensor::fromDLPack, "buffer"_a, "layout"_a, - "Wraps a DLPack supported tensor in a rocpycv tensor."); + .def( + "reshape", + [](PyTensor& self, std::vector newShape, py::object layout) { + return self.reshape(newShape, LayoutFromPyObject(layout)); + }, + "new_shape"_a, "layout"_a, + "Creates a new tensor with the specified shape. ``layout`` may be an " + "``rocpycv.eTensorLayout`` or a layout string (e.g. ``\"NHWC\"``)."); + m.def( + "from_dlpack", + [](pybind11::object src, py::object layout) { return PyTensor::fromDLPack(src, LayoutFromPyObject(layout)); }, + "buffer"_a, "layout"_a, + "Wraps a DLPack supported tensor in a rocpycv tensor. ``layout`` may be an " + "``rocpycv.eTensorLayout`` or a layout string (e.g. ``\"NHWC\"``)."); } \ No newline at end of file diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index c6bc639c..ce2e7584 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -170,9 +170,9 @@ class Tensor: """ Returns a tuple containing the DLPack device and device id for the tensor. 
""" - def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: eTensorLayout, dtype: eDataType, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: + def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: typing.Any, dtype: typing.Any, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ - Constructs a tensor object. + Constructs a tensor object. ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NHWC``) or a layout string (``"NHWC"``). ``dtype`` may be an ``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). """ def copy_to(self, device: eDeviceType) -> Tensor: """ @@ -198,9 +198,9 @@ class Tensor: """ Returns the number of dimensions of the tensor. """ - def reshape(self, new_shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: eTensorLayout) -> Tensor: + def reshape(self, new_shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: typing.Any) -> Tensor: """ - Creates a new tensor with the specified shape. + Creates a new tensor with the specified shape. ``layout`` may be an ``rocpycv.eTensorLayout`` or a layout string (e.g. ``"NHWC"``). """ def shape(self) -> list[int]: """ @@ -1089,9 +1089,9 @@ def flip_into(dst: Tensor, src: Tensor, flip_code: typing.SupportsInt | typing.S Returns: None """ -def from_dlpack(buffer: typing.Any, layout: eTensorLayout) -> Tensor: +def from_dlpack(buffer: typing.Any, layout: typing.Any) -> Tensor: """ - Wraps a DLPack supported tensor in a rocpycv tensor. + Wraps a DLPack supported tensor in a rocpycv tensor. ``layout`` may be an ``rocpycv.eTensorLayout`` or a layout string (e.g. ``"NHWC"``). 
""" def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py index e36b0c85..b064433d 100644 --- a/samples/migraphx-classification/migraphx_classification.py +++ b/samples/migraphx-classification/migraphx_classification.py @@ -123,15 +123,15 @@ def main() -> None: print(f"Input image shape: {np_image.shape}") # Load/allocate tensors on the GPU - input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) - resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) - rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) - f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) - normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) - preprocessed : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), rocpycv.NCHW, rocpycv.F32) - - mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) - std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) + input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, "NHWC").copy_to(rocpycv.GPU) + resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.uint8) + rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.uint8) + f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.float32) + normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.float32) + preprocessed : rocpycv.Tensor = 
rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), "NCHW", np.float32) + + mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), "NHWC").copy_to(rocpycv.GPU) + std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), "NHWC").copy_to(rocpycv.GPU) # Setup MIGraphX arguments/shapes in_shape : migraphx.shape = migraphx.shape(type="float_type", lens=preprocessed.shape()) From 6a0e0faf4516e974985bafe4a2235efbfbb58db9 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 18:26:07 -0400 Subject: [PATCH 12/20] Swap dtype <-> layout position in tensor construction --- python/include/py_tensor.hpp | 4 ++-- python/src/py_tensor.cpp | 16 ++++++++-------- python/src/rocpycv.pyi | 4 ++-- .../migraphx_classification.py | 10 +++++----- tests/roccv/python/test_op_adv_cvt_color.py | 6 +++--- tests/roccv/python/test_op_bilateral_filter.py | 2 +- tests/roccv/python/test_op_bnd_box.py | 3 +-- tests/roccv/python/test_op_center_crop.py | 3 +-- tests/roccv/python/test_op_composite.py | 2 +- tests/roccv/python/test_op_convert_to.py | 2 +- tests/roccv/python/test_op_copy_make_border.py | 2 +- tests/roccv/python/test_op_custom_crop.py | 3 +-- tests/roccv/python/test_op_cvt_color.py | 2 +- tests/roccv/python/test_op_flip.py | 2 +- tests/roccv/python/test_op_gamma_contrast.py | 2 +- tests/roccv/python/test_op_histogram.py | 2 +- .../roccv/python/test_op_non_max_suppression.py | 2 +- tests/roccv/python/test_op_normalize.py | 2 +- tests/roccv/python/test_op_reformat.py | 2 +- tests/roccv/python/test_op_remap.py | 2 +- tests/roccv/python/test_op_resize.py | 2 +- tests/roccv/python/test_op_rotate.py | 2 +- tests/roccv/python/test_op_thresholding.py | 2 +- tests/roccv/python/test_op_warp_affine.py | 2 +- tests/roccv/python/test_op_warp_perspective.py | 2 +- 25 files changed, 40 insertions(+), 43 deletions(-) diff --git a/python/include/py_tensor.hpp b/python/include/py_tensor.hpp index f0c3e1d2..180006d2 100644 --- 
a/python/include/py_tensor.hpp +++ b/python/include/py_tensor.hpp @@ -38,11 +38,11 @@ class PyTensor : public std::enable_shared_from_this { * wrap. * * @param shape The shape of the tensor. - * @param layout The layout of the tensor. * @param dtype The data type of the tensor. + * @param layout The layout of the tensor. * @param device The device of the tensor. */ - PyTensor(std::vector shape, eTensorLayout layout, eDataType dtype, eDeviceType device); + PyTensor(std::vector shape, eDataType dtype, eTensorLayout layout, eDeviceType device); /** * @brief Wraps an existing roccv::Tensor inside of a newly constructed PyTensor. diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 4fa4dd1b..5c757c02 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -59,7 +59,7 @@ DLManagedTensor* createDLManagedTensor(std::shared_ptr tensor, st return dlTensor; } -PyTensor::PyTensor(std::vector shape, eTensorLayout layout, eDataType dtype, eDeviceType device) { +PyTensor::PyTensor(std::vector shape, eDataType dtype, eTensorLayout layout, eDeviceType device) { roccv::TensorShape tShape(roccv::TensorShape(roccv::TensorLayout(layout), shape)); m_tensor = std::make_shared(tShape, roccv::DataType(dtype), device); } @@ -202,15 +202,15 @@ void PyTensor::Export(pybind11::module& m) { pybind11::class_> tensor(m, "Tensor"); tensor - .def(pybind11::init([](std::vector shape, py::object layout, py::object dtype, eDeviceType device) { - return std::make_shared(shape, LayoutFromPyObject(layout), DataTypeFromPyObject(dtype), + .def(pybind11::init([](std::vector shape, py::object dtype, py::object layout, eDeviceType device) { + return std::make_shared(shape, DataTypeFromPyObject(dtype), LayoutFromPyObject(layout), device); }), - "shape"_a, "layout"_a, "dtype"_a, "device"_a = eDeviceType::GPU, - "Constructs a tensor object. ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. " - "``rocpycv.NHWC``) or a layout string (``\"NHWC\"``). 
``dtype`` may be an " - "``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type " - "(e.g. ``np.float32``).") + "shape"_a, "dtype"_a, "layout"_a, "device"_a = eDeviceType::GPU, + "Constructs a tensor object. ``dtype`` may be an ``rocpycv.eDataType`` (e.g. " + "``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). ``layout`` " + "may be an ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NHWC``) or a layout string " + "(``\"NHWC\"``).") .def("copy_to", &PyTensor::copyTo, "device"_a, "Returns a deep copy of the tensor with data copied to a specified device type.") .def("__dlpack__", &PyTensor::toDLPack, "stream"_a = py::none(), diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index ce2e7584..f24fb615 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -170,9 +170,9 @@ class Tensor: """ Returns a tuple containing the DLPack device and device id for the tensor. """ - def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: typing.Any, dtype: typing.Any, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: + def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], dtype: typing.Any, layout: typing.Any, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ - Constructs a tensor object. ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NHWC``) or a layout string (``"NHWC"``). ``dtype`` may be an ``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). + Constructs a tensor object. ``dtype`` may be an ``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NHWC``) or a layout string (``"NHWC"``). 
""" def copy_to(self, device: eDeviceType) -> Tensor: """ diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py index b064433d..be7fc66e 100644 --- a/samples/migraphx-classification/migraphx_classification.py +++ b/samples/migraphx-classification/migraphx_classification.py @@ -124,11 +124,11 @@ def main() -> None: # Load/allocate tensors on the GPU input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, "NHWC").copy_to(rocpycv.GPU) - resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.uint8) - rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.uint8) - f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.float32) - normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.float32) - preprocessed : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), "NCHW", np.float32) + resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.uint8, "NHWC") + rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.uint8, "NHWC") + f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.float32, "NHWC") + normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.float32, "NHWC") + preprocessed : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), np.float32, "NCHW") mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), "NHWC").copy_to(rocpycv.GPU) std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), "NHWC").copy_to(rocpycv.GPU) diff --git a/tests/roccv/python/test_op_adv_cvt_color.py b/tests/roccv/python/test_op_adv_cvt_color.py index 8a243f68..32b7f686 100644 --- a/tests/roccv/python/test_op_adv_cvt_color.py +++ b/tests/roccv/python/test_op_adv_cvt_color.py @@ -62,7 +62,7 @@ 
@pytest.mark.parametrize("samples,width,height", [[1, 64, 48], [2, 128, 72]]) def test_op_advcvtcolor_interleaved444(samples, height, width, code, spec, dtype, device): input_tensor = generate_tensor(samples, width, height, 3, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, 3], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, 3], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.advcvtcolor(input_tensor, code, spec, stream, device) @@ -79,7 +79,7 @@ def test_op_advcvtcolor_interleaved444(samples, height, width, code, spec, dtype @pytest.mark.parametrize("samples,width,height", [[1, 64, 48], [2, 128, 72]]) def test_op_advcvtcolor_interleaved_to_semiplanar(samples, height, width, code, spec, dtype, device): input_tensor = generate_tensor(samples, width, height, 3, dtype, device) - output_golden = rocpycv.Tensor([samples, (height * 3) // 2, width, 1], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, (height * 3) // 2, width, 1], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.advcvtcolor(input_tensor, code, spec, stream, device) @@ -96,7 +96,7 @@ def test_op_advcvtcolor_interleaved_to_semiplanar(samples, height, width, code, @pytest.mark.parametrize("samples,width,height", [[1, 64, 48], [2, 128, 72]]) def test_op_advcvtcolor_semiplanar_to_interleaved(samples, height, width, code, spec, dtype, device): input_tensor = generate_tensor(samples, width, (height * 3) // 2, 1, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, 3], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, 3], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.advcvtcolor(input_tensor, code, spec, stream, device) diff --git a/tests/roccv/python/test_op_bilateral_filter.py 
b/tests/roccv/python/test_op_bilateral_filter.py index 6e0abfd3..72e6708d 100644 --- a/tests/roccv/python/test_op_bilateral_filter.py +++ b/tests/roccv/python/test_op_bilateral_filter.py @@ -44,7 +44,7 @@ ]) def test_op_bilateral_filter(samples, height, width, channels, border_mode, border_val, diameter, sigma_color, sigma_space, dtype, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.bilateral_filter_into(output_golden, input, diameter, sigma_color, diff --git a/tests/roccv/python/test_op_bnd_box.py b/tests/roccv/python/test_op_bnd_box.py index d6fd8045..f5377b65 100644 --- a/tests/roccv/python/test_op_bnd_box.py +++ b/tests/roccv/python/test_op_bnd_box.py @@ -52,8 +52,7 @@ def generate_boxes(samples: int, height: int, width: int) -> rocpycv.BndBoxes: def test_op_remap(samples, height, width, channels, device): input = generate_tensor(samples, width, height, channels, rocpycv.eDataType.U8, device) boxes = generate_boxes(samples, height, width) - output_golden = rocpycv.Tensor([samples, height, width, channels], - rocpycv.eTensorLayout.NHWC, rocpycv.eDataType.U8, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eDataType.U8, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.bndbox(input, boxes, stream, device) diff --git a/tests/roccv/python/test_op_center_crop.py b/tests/roccv/python/test_op_center_crop.py index a8054dcc..8daa507f 100644 --- a/tests/roccv/python/test_op_center_crop.py +++ b/tests/roccv/python/test_op_center_crop.py @@ -41,8 +41,7 @@ ]) def test_op_center_crop(samples, height, width, channels, dtype, box, device): input = generate_tensor(samples, width, height, channels, dtype, device) - 
output_golden = rocpycv.Tensor([samples, box[1], box[0], channels], - rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, box[1], box[0], channels], dtype, rocpycv.eTensorLayout.NHWC, device) if device == rocpycv.eDeviceType.GPU: stream = rocpycv.Stream() diff --git a/tests/roccv/python/test_op_composite.py b/tests/roccv/python/test_op_composite.py index e5d37538..12f43668 100644 --- a/tests/roccv/python/test_op_composite.py +++ b/tests/roccv/python/test_op_composite.py @@ -39,7 +39,7 @@ def test_op_composite(samples, height, width, out_channels, dtype, device): foreground = generate_tensor(samples, width, height, 3, dtype, device) background = generate_tensor(samples, width, height, 3, dtype, device) mask = generate_tensor(samples, width, height, 1, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, out_channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, out_channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.composite(foreground, background, mask, out_channels, stream, device) diff --git a/tests/roccv/python/test_op_convert_to.py b/tests/roccv/python/test_op_convert_to.py index 9e9f389f..c7555531 100644 --- a/tests/roccv/python/test_op_convert_to.py +++ b/tests/roccv/python/test_op_convert_to.py @@ -40,7 +40,7 @@ ]) def test_op_convert_to(samples, height, width, channels, device, dtype, out_dtype, alpha, beta): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, out_dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], out_dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.convert_to_into(output_golden, input, alpha, beta, stream, device) diff --git a/tests/roccv/python/test_op_copy_make_border.py 
b/tests/roccv/python/test_op_copy_make_border.py index 27994e00..fcafe36f 100644 --- a/tests/roccv/python/test_op_copy_make_border.py +++ b/tests/roccv/python/test_op_copy_make_border.py @@ -46,7 +46,7 @@ def test_op_copy_make_border(samples, height, width, channels, top, right, bottom, left, border_mode, border_value, dtype, device): input = generate_tensor(samples, width, height, channels, dtype, device) output_golden = rocpycv.Tensor([samples, height + top + bottom, width + right + left, - channels], rocpycv.eTensorLayout.NHWC, dtype, device) + channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.copymakeborder(input, border_mode, border_value, top, bottom, left, right, stream, device) diff --git a/tests/roccv/python/test_op_custom_crop.py b/tests/roccv/python/test_op_custom_crop.py index 026130c5..47c3f1f1 100644 --- a/tests/roccv/python/test_op_custom_crop.py +++ b/tests/roccv/python/test_op_custom_crop.py @@ -41,8 +41,7 @@ ]) def test_op_custom_crop(samples, height, width, channels, dtype, box, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, box.height, box.width, channels], - rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, box.height, box.width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) if device == rocpycv.eDeviceType.GPU: stream = rocpycv.Stream() rocpycv.custom_crop_into(output_golden, input, box, stream, device) diff --git a/tests/roccv/python/test_op_cvt_color.py b/tests/roccv/python/test_op_cvt_color.py index 448a598b..637fbbfe 100644 --- a/tests/roccv/python/test_op_cvt_color.py +++ b/tests/roccv/python/test_op_cvt_color.py @@ -51,7 +51,7 @@ def test_op_cvtcolor(samples, height, width, code, dtype, device): out_channels = 1 input = generate_tensor(samples, width, height, in_channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, out_channels], 
rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, out_channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.cvtcolor(input, code, stream, device) diff --git a/tests/roccv/python/test_op_flip.py b/tests/roccv/python/test_op_flip.py index 32ed6215..c4980cd6 100644 --- a/tests/roccv/python/test_op_flip.py +++ b/tests/roccv/python/test_op_flip.py @@ -40,7 +40,7 @@ def test_op_flip(samples, width, height, channels, dtype, flip_code, device): input_tensor = generate_tensor(samples, width, height, channels, dtype, device) stream = rocpycv.Stream() - output_tensor_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_tensor_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) rocpycv.flip_into(output_tensor_golden, input_tensor, flip_code, stream, device) output_tensor = rocpycv.flip(input_tensor, flip_code, stream, device) stream.synchronize() diff --git a/tests/roccv/python/test_op_gamma_contrast.py b/tests/roccv/python/test_op_gamma_contrast.py index 68343ec9..38893594 100644 --- a/tests/roccv/python/test_op_gamma_contrast.py +++ b/tests/roccv/python/test_op_gamma_contrast.py @@ -38,7 +38,7 @@ ]) def test_op_gamma_contrast(samples, height, width, channels, gamma, dtype, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.gamma_contrast(input, gamma, stream, device) diff --git a/tests/roccv/python/test_op_histogram.py b/tests/roccv/python/test_op_histogram.py index e980a057..17c741d2 100644 --- a/tests/roccv/python/test_op_histogram.py +++ b/tests/roccv/python/test_op_histogram.py 
@@ -37,7 +37,7 @@ ]) def test_op_histogram(samples, height, width, in_dtype, out_dtype, device): input = generate_tensor(samples, width, height, 1, in_dtype, device) - output_golden = rocpycv.Tensor([samples, 256, 1], rocpycv.eTensorLayout.HWC, out_dtype, device) + output_golden = rocpycv.Tensor([samples, 256, 1], out_dtype, rocpycv.eTensorLayout.HWC, device) stream = rocpycv.Stream() output = rocpycv.histogram(input, None, stream, device) diff --git a/tests/roccv/python/test_op_non_max_suppression.py b/tests/roccv/python/test_op_non_max_suppression.py index 163a0a8f..e39c1e44 100644 --- a/tests/roccv/python/test_op_non_max_suppression.py +++ b/tests/roccv/python/test_op_non_max_suppression.py @@ -44,7 +44,7 @@ def generate_boxes(samples: int, num_boxes: int, device: rocpycv.eDeviceType) -> def test_op_non_max_suppression(samples, num_boxes, device): boxes = generate_boxes(samples, num_boxes, device) scores = generate_tensor_generic([samples, num_boxes], rocpycv.eTensorLayout.NW, rocpycv.eDataType.F32, device) - output_golden = rocpycv.Tensor([samples, num_boxes], rocpycv.eTensorLayout.NW, rocpycv.eDataType.U8, device) + output_golden = rocpycv.Tensor([samples, num_boxes], rocpycv.eDataType.U8, rocpycv.eTensorLayout.NW, device) stream = rocpycv.Stream() # Hardcoding the score and IoU threshold here. 
The only thing we care about is the resulting size of the diff --git a/tests/roccv/python/test_op_normalize.py b/tests/roccv/python/test_op_normalize.py index e25bc300..72036515 100644 --- a/tests/roccv/python/test_op_normalize.py +++ b/tests/roccv/python/test_op_normalize.py @@ -39,7 +39,7 @@ def test_op_normalize(samples, height, width, channels, device, dtype): input = generate_tensor(samples, width, height, channels, dtype, device) base = generate_tensor(1, 1, 1, channels, rocpycv.eDataType.F32, device) scale = generate_tensor(1, 1, 1, channels, rocpycv.eDataType.F32, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.normalize_into(output_golden, input, base, scale, None, 1.0, 0.0, 0.0, stream, device) diff --git a/tests/roccv/python/test_op_reformat.py b/tests/roccv/python/test_op_reformat.py index 62715ff9..10d2b7a6 100644 --- a/tests/roccv/python/test_op_reformat.py +++ b/tests/roccv/python/test_op_reformat.py @@ -54,7 +54,7 @@ def test_op_reformat(samples, height, width, channels, inLayout, outLayout, devi input_shape = create_tensor_shape(inLayout, samples, channels, height, width) output_shape = create_tensor_shape(outLayout, samples, channels, height, width) input_tensor = generate_tensor_generic(input_shape, inLayout, dtype, device) - output_golden = rocpycv.Tensor(output_shape, outLayout, dtype, device) + output_golden = rocpycv.Tensor(output_shape, dtype, outLayout, device) stream = rocpycv.Stream() rocpycv.reformat_into(input_tensor, output_golden, stream, device) diff --git a/tests/roccv/python/test_op_remap.py b/tests/roccv/python/test_op_remap.py index 25321f9e..e9ccc394 100644 --- a/tests/roccv/python/test_op_remap.py +++ b/tests/roccv/python/test_op_remap.py @@ -44,7 +44,7 @@ def test_op_remap(samples, width, height, channels, dtype, 
map_interp, interp, map_type, align_corners, border_mode, border_val, device): input_tensor = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) if (map_type == rocpycv.REMAP_ABSOLUTE): diff --git a/tests/roccv/python/test_op_resize.py b/tests/roccv/python/test_op_resize.py index 721c1d7f..70e0c897 100644 --- a/tests/roccv/python/test_op_resize.py +++ b/tests/roccv/python/test_op_resize.py @@ -38,7 +38,7 @@ def test_op_resize(out_shape, in_shape, samples, channels, interp, dtype, device # Input/Output shapes are passed in as format [width, height] input = generate_tensor(samples, in_shape[0], in_shape[1], channels, dtype, device) output_shape = (samples, out_shape[1], out_shape[0], channels) - output_golden = rocpycv.Tensor(output_shape, rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor(output_shape, dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.resize_into(output_golden, input, interp, stream, device) diff --git a/tests/roccv/python/test_op_rotate.py b/tests/roccv/python/test_op_rotate.py index 13c2fceb..1fa1e8e8 100644 --- a/tests/roccv/python/test_op_rotate.py +++ b/tests/roccv/python/test_op_rotate.py @@ -46,7 +46,7 @@ def calc_center_shift(center_x, center_y, angle) -> tuple[float, float]: ]) def test_op_rotate(samples, width, height, channels, angle, dtype, interp, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) center_x = (width - 1) / 2 center_y = (height - 1) / 2 diff --git 
a/tests/roccv/python/test_op_thresholding.py b/tests/roccv/python/test_op_thresholding.py index 1eae8d97..8613a266 100644 --- a/tests/roccv/python/test_op_thresholding.py +++ b/tests/roccv/python/test_op_thresholding.py @@ -41,7 +41,7 @@ def test_op_thresholding(samples, height, width, channels, dtype, thresh, mvdata, threshType, device): input_tensor = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) thresh_array = np.full(samples, thresh, np.float64) maxval_array = np.full(samples, mvdata, np.float64) diff --git a/tests/roccv/python/test_op_warp_affine.py b/tests/roccv/python/test_op_warp_affine.py index b4fd0dfb..9b9656b0 100644 --- a/tests/roccv/python/test_op_warp_affine.py +++ b/tests/roccv/python/test_op_warp_affine.py @@ -41,7 +41,7 @@ ]) def test_op_warp_affine(samples, width, height, channels, dtype, mat, inverted, interp, border_mode, border_val, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.warp_affine(input, mat, inverted, interp, border_mode, border_val, stream, device) diff --git a/tests/roccv/python/test_op_warp_perspective.py b/tests/roccv/python/test_op_warp_perspective.py index c72754a6..3164a951 100644 --- a/tests/roccv/python/test_op_warp_perspective.py +++ b/tests/roccv/python/test_op_warp_perspective.py @@ -41,7 +41,7 @@ ]) def test_op_warp_perspective(samples, width, height, channels, dtype, mat, inverted, interp, border_mode, border_val, device): input = generate_tensor(samples, width, height, channels, 
dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) print(output_golden.shape()) stream = rocpycv.Stream() From ea1caff478cf1b806e5cac736ac570add7825741 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 18:30:43 -0400 Subject: [PATCH 13/20] Add numpy/string layout options for convert_to and reformat python operators --- python/src/operators/py_op_convert_to.cpp | 18 +++++++++++++----- python/src/operators/py_op_reformat.cpp | 13 ++++++++++--- python/src/rocpycv.pyi | 12 +++++++----- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/python/src/operators/py_op_convert_to.cpp b/python/src/operators/py_op_convert_to.cpp index 9100ce3f..ad3a32a5 100644 --- a/python/src/operators/py_op_convert_to.cpp +++ b/python/src/operators/py_op_convert_to.cpp @@ -24,6 +24,8 @@ THE SOFTWARE. #include +#include "py_helpers.hpp" + PyTensor PyOpConvertTo::Execute(PyTensor& input, eDataType dtype, double alpha, double beta, std::optional> stream, eDeviceType device) { hipStream_t hipStream = stream.has_value() ? 
stream.value().get().getStream() : nullptr; @@ -44,17 +46,23 @@ void PyOpConvertTo::ExecuteInto(PyTensor& output, PyTensor& input, double alpha, void PyOpConvertTo::Export(py::module& m) { using namespace py::literals; - m.def("convert_to", &PyOpConvertTo::Execute, "src"_a, "dtype"_a, "alpha"_a = 1.0, "beta"_a = 0.0, - "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc( - + m.def("convert_to", + [](PyTensor& input, py::object dtype, double alpha, double beta, + std::optional> stream, eDeviceType device) { + return PyOpConvertTo::Execute(input, DataTypeFromPyObject(dtype), alpha, beta, stream, device); + }, + "src"_a, "dtype"_a, "alpha"_a = 1.0, "beta"_a = 0.0, "stream"_a = nullptr, + "device"_a = eDeviceType::GPU, R"pbdoc( + Executes the Convert To operation on the given HIP stream. See also: Refer to the rocCV C++ API reference for more information on this operation. - + Args: src (rocpycv.Tensor): Input tensor containing one or more images. - dtype (eDataType): Datatype of the output tensor. + dtype: Datatype of the output tensor. Either an ``rocpycv.eDataType`` + (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). alpha (double, optional): Scalar for output data. Defaults to 1.0. beta (double, optional): Offset for the data. Defaults to 0.0. stream (rocpycv.Stream, optional): HIP stream to run this operation on. diff --git a/python/src/operators/py_op_reformat.cpp b/python/src/operators/py_op_reformat.cpp index 6cfa4105..1351ca53 100644 --- a/python/src/operators/py_op_reformat.cpp +++ b/python/src/operators/py_op_reformat.cpp @@ -22,6 +22,8 @@ THE SOFTWARE. #include "operators/py_op_reformat.hpp" +#include "py_helpers.hpp" + void PyOpReformat::ExecuteInto(PyTensor& output, PyTensor& input, std::optional> stream, eDeviceType device) { hipStream_t hipStream = stream.has_value() ? 
stream.value().get().getStream() : nullptr; @@ -46,8 +48,12 @@ PyTensor PyOpReformat::Execute(PyTensor& input, eTensorLayout outLayout, void PyOpReformat::Export(py::module& m) { using namespace py::literals; - m.def("reformat", &PyOpReformat::Execute, "input"_a, "out_layout"_a, "stream"_a = nullptr, - "device"_a = eDeviceType::GPU, R"pbdoc( + m.def("reformat", + [](PyTensor& input, py::object outLayout, + std::optional> stream, eDeviceType device) { + return PyOpReformat::Execute(input, LayoutFromPyObject(outLayout), stream, device); + }, + "input"_a, "out_layout"_a, "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc( Executes the Reformat operation and returns the result as a new tensor. See also: @@ -55,7 +61,8 @@ void PyOpReformat::Export(py::module& m) { Args: input (rocpycv.Tensor): Input tensor to reformat. - out_layout (rocpycv.eTensorLayout): The layout to reformat the input tensor to. + out_layout: The layout to reformat the input tensor to. Either an + ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NCHW``) or a layout string (``"NCHW"``). stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. 
diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index f24fb615..b9a0aee7 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -913,16 +913,17 @@ def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Returns: None """ -def convert_to(src: Tensor, dtype: eDataType, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: +def convert_to(src: Tensor, dtype: typing.Any, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Convert To operation on the given HIP stream. See also: Refer to the rocCV C++ API reference for more information on this operation. - + Args: src (rocpycv.Tensor): Input tensor containing one or more images. - dtype (eDataType): Datatype of the output tensor. + dtype: Datatype of the output tensor. Either an ``rocpycv.eDataType`` + (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). alpha (double, optional): Scalar for output data. Defaults to 1.0. beta (double, optional): Offset for the data. Defaults to 0.0. stream (rocpycv.Stream, optional): HIP stream to run this operation on. @@ -1239,7 +1240,7 @@ def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: Returns: None """ -def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: +def reformat(input: Tensor, out_layout: typing.Any, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Reformat operation and returns the result as a new tensor. 
@@ -1248,7 +1249,8 @@ def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | Args: input (rocpycv.Tensor): Input tensor to reformat. - out_layout (rocpycv.eTensorLayout): The layout to reformat the input tensor to. + out_layout: The layout to reformat the input tensor to. Either an + ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NCHW``) or a layout string (``"NCHW"``). stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. From b4c39d5d2ccf062a23f0b34a6010cdf8713ef237 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 18:56:01 -0400 Subject: [PATCH 14/20] Improve initial rocpycv module documentation --- python/src/main.cpp | 57 +++++++++++++++++++++++++++++++++++++++--- python/src/rocpycv.pyi | 57 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 108 insertions(+), 6 deletions(-) diff --git a/python/src/main.cpp b/python/src/main.cpp index ddbec687..e34c6fba 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -52,9 +52,60 @@ THE SOFTWARE. PYBIND11_MODULE(rocpycv, m) { m.doc() = R"pbdoc( - Python API reference - ----------------------- - This is the Python API reference for rocCV. + rocpycv — AMD GPU-accelerated image pre/post-processing + ======================================================= + + rocpycv is the Python binding for rocCV, a HIP/ROCm image processing + library. It exposes a NumPy-friendly :class:`Tensor` and a suite of + operators (resize, normalize, color conversion, geometric warps, ...) + that run on either GPU (default) or CPU. + + Quick start + ----------- + .. code-block:: python + + import numpy as np + import rocpycv + + # Wrap a NumPy array as a CPU Tensor (zero-copy via DLPack), then + # copy it to the GPU (explicit H2D transfer). 
+ host = np.zeros((1, 480, 640, 3), np.uint8) + src = rocpycv.from_dlpack(host, "NHWC").copy_to(rocpycv.GPU) + + # Functional form: operators allocate and return a new Tensor. + resized = rocpycv.resize(src, (1, 224, 224, 3), rocpycv.LINEAR) + chw = rocpycv.reformat(resized, "NCHW") + + # ``*_into`` form: write into a caller-allocated output, optionally + # on a stream — useful in hot preprocessing loops. + stream = rocpycv.Stream() + out = rocpycv.Tensor((1, 224, 224, 3), np.uint8, "NHWC") + rocpycv.resize_into(out, src, rocpycv.LINEAR, stream) + stream.synchronize() + + Tensors + ------- + :class:`Tensor` arguments accept either rocpycv enums or familiar + Python types: + + * ``dtype`` — ``rocpycv.F32`` or any NumPy dtype/scalar (``np.float32``). + * ``layout`` — ``rocpycv.NHWC`` or a layout string (``"NHWC"``). + + For zero-copy interop, tensors implement the DLPack protocol — pass any + ``__dlpack__``-supporting object (NumPy array, PyTorch tensor, ...) to + :func:`from_dlpack`, and use :meth:`Tensor.data_ptr` to hand a raw GPU + pointer to inference frameworks such as MIGraphX. + + Operators + --------- + Most operators come in two forms: + + * ``op(src, ...)`` — allocates and returns a new :class:`Tensor`. + * ``op_into(dst, src, ...)`` — writes into a pre-allocated output, + avoiding per-call allocation in tight loops. + + All operators accept an optional ``stream`` (a :class:`Stream` wrapping + a ``hipStream_t``) and a ``device`` argument (defaults to GPU). )pbdoc"; PyException::Export(m); PyEnums::Export(m); diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index b9a0aee7..270fb848 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -1,8 +1,59 @@ """ - Python API reference - ----------------------- - This is the Python API reference for rocCV. 
+ rocpycv — AMD GPU-accelerated image pre/post-processing + ======================================================= + + rocpycv is the Python binding for rocCV, a HIP/ROCm image processing + library. It exposes a NumPy-friendly :class:`Tensor` and a suite of + operators (resize, normalize, color conversion, geometric warps, ...) + that run on either GPU (default) or CPU. + + Quick start + ----------- + .. code-block:: python + + import numpy as np + import rocpycv + + # Wrap a NumPy array as a CPU Tensor (zero-copy via DLPack), then + # copy it to the GPU (explicit H2D transfer). + host = np.zeros((1, 480, 640, 3), np.uint8) + src = rocpycv.from_dlpack(host, "NHWC").copy_to(rocpycv.GPU) + + # Functional form: operators allocate and return a new Tensor. + resized = rocpycv.resize(src, (1, 224, 224, 3), rocpycv.LINEAR) + chw = rocpycv.reformat(resized, "NCHW") + + # ``*_into`` form: write into a caller-allocated output, optionally + # on a stream — useful in hot preprocessing loops. + stream = rocpycv.Stream() + out = rocpycv.Tensor((1, 224, 224, 3), np.uint8, "NHWC") + rocpycv.resize_into(out, src, rocpycv.LINEAR, stream) + stream.synchronize() + + Tensors + ------- + :class:`Tensor` arguments accept either rocpycv enums or familiar + Python types: + + * ``dtype`` — ``rocpycv.F32`` or any NumPy dtype/scalar (``np.float32``). + * ``layout`` — ``rocpycv.NHWC`` or a layout string (``"NHWC"``). + + For zero-copy interop, tensors implement the DLPack protocol — pass any + ``__dlpack__``-supporting object (NumPy array, PyTorch tensor, ...) to + :func:`from_dlpack`, and use :meth:`Tensor.data_ptr` to hand a raw GPU + pointer to inference frameworks such as MIGraphX. + + Operators + --------- + Most operators come in two forms: + + * ``op(src, ...)`` — allocates and returns a new :class:`Tensor`. + * ``op_into(dst, src, ...)`` — writes into a pre-allocated output, + avoiding per-call allocation in tight loops. 
+ + All operators accept an optional ``stream`` (a :class:`Stream` wrapping + a ``hipStream_t``) and a ``device`` argument (defaults to GPU). """ from __future__ import annotations From edc8dac8cce3b2886730b415512b1cc1bb7341b8 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 11:35:10 -0400 Subject: [PATCH 15/20] Rename test_op_remap -> test_op_bndbox --- tests/roccv/python/test_op_bnd_box.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/roccv/python/test_op_bnd_box.py b/tests/roccv/python/test_op_bnd_box.py index f5377b65..1b203a5a 100644 --- a/tests/roccv/python/test_op_bnd_box.py +++ b/tests/roccv/python/test_op_bnd_box.py @@ -49,7 +49,7 @@ def generate_boxes(samples: int, height: int, width: int) -> rocpycv.BndBoxes: (3, 150, 50), (7, 15, 23) ]) -def test_op_remap(samples, height, width, channels, device): +def test_op_bndbox(samples, height, width, channels, device): input = generate_tensor(samples, width, height, channels, rocpycv.eDataType.U8, device) boxes = generate_boxes(samples, height, width) output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eDataType.U8, rocpycv.eTensorLayout.NHWC, device) From e52f8dcaf438a645d4cb1c85edca91270978acd6 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 11:38:36 -0400 Subject: [PATCH 16/20] Remove support for 4S16 in DLTypeToRoccvType --- python/src/py_helpers.cpp | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/python/src/py_helpers.cpp b/python/src/py_helpers.cpp index 8d4dce55..15ea92fc 100644 --- a/python/src/py_helpers.cpp +++ b/python/src/py_helpers.cpp @@ -22,23 +22,23 @@ THE SOFTWARE. 
#include "py_helpers.hpp" -#include #include +#include #include #include eDataType DLTypeToRoccvType(DLDataType dtype) { + if (dtype.lanes != 1) { + throw std::runtime_error("Datatype is not supported."); + } + if (dtype.bits == 8) { if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U8; if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S8; } else if (dtype.bits == 16) { - if (dtype.lanes == 4) { - return eDataType::DATA_TYPE_4S16; - } else if (dtype.lanes == 1) { - if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U16; - if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S16; - } + if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U16; + if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S16; } else if (dtype.bits == 32) { if (dtype.code == kDLFloat) return eDataType::DATA_TYPE_F32; if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U32; @@ -181,18 +181,23 @@ eDataType DataTypeFromPyObject(py::object obj) { static const py::object np_dtype = py::module_::import("numpy").attr("dtype"); dt = np_dtype(obj).cast(); } catch (const std::exception&) { - throw std::runtime_error( - "dtype must be an rocpycv.eDataType or a NumPy dtype/scalar type (e.g. np.float32)."); + throw std::runtime_error("dtype must be an rocpycv.eDataType or a NumPy dtype/scalar type (e.g. 
np.float32)."); } DLDataTypeCode code; switch (dt.kind()) { - case 'u': code = kDLUInt; break; - case 'i': code = kDLInt; break; - case 'f': code = kDLFloat; break; + case 'u': + code = kDLUInt; + break; + case 'i': + code = kDLInt; + break; + case 'f': + code = kDLFloat; + break; default: - throw std::runtime_error("Unsupported NumPy dtype for rocpycv.Tensor (kind '" + - std::string(1, dt.kind()) + "')."); + throw std::runtime_error("Unsupported NumPy dtype for rocpycv.Tensor (kind '" + std::string(1, dt.kind()) + + "')."); } DLDataType dl{static_cast(code), static_cast(dt.itemsize() * 8), 1}; return DLTypeToRoccvType(dl); From ac7451787ccda810716536a789d6800b391d1515 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 13:24:56 -0400 Subject: [PATCH 17/20] Add test for PyStream handle() --- tests/roccv/python/test_py_stream.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/roccv/python/test_py_stream.py diff --git a/tests/roccv/python/test_py_stream.py b/tests/roccv/python/test_py_stream.py new file mode 100644 index 00000000..8b35b16d --- /dev/null +++ b/tests/roccv/python/test_py_stream.py @@ -0,0 +1,32 @@ +# ############################################################################## +# Copyright (c) - 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# ############################################################################## + +import rocpycv + + +def test_stream_handle_returns_valid_pointer(): + stream = rocpycv.Stream() + handle = stream.handle() + + assert isinstance(handle, int) + assert handle != 0 From c150f7927043e1c148518baee2df391c2e734cc0 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 13:31:37 -0400 Subject: [PATCH 18/20] Add basic PyTensor test --- tests/roccv/python/test_py_tensor.py | 46 ++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/roccv/python/test_py_tensor.py diff --git a/tests/roccv/python/test_py_tensor.py b/tests/roccv/python/test_py_tensor.py new file mode 100644 index 00000000..684f5d45 --- /dev/null +++ b/tests/roccv/python/test_py_tensor.py @@ -0,0 +1,46 @@ +# ############################################################################## +# Copyright (c) - 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +# ############################################################################## + +import pytest +import rocpycv + + +@pytest.mark.parametrize("device", [rocpycv.eDeviceType.GPU, rocpycv.eDeviceType.CPU]) +@pytest.mark.parametrize("dtype", [rocpycv.eDataType.U8, rocpycv.eDataType.F32, rocpycv.eDataType.S32]) +@pytest.mark.parametrize( + "shape, layout", + [ + ([2, 32, 64, 3], rocpycv.eTensorLayout.NHWC), + ([1, 3, 16, 16], rocpycv.eTensorLayout.NCHW), + ([8, 8, 4], rocpycv.eTensorLayout.HWC), + ], +) +def test_tensor_basic_properties(shape, layout, dtype, device): + tensor = rocpycv.Tensor(shape, dtype, layout, device) + + assert tensor.shape() == shape + assert tensor.ndim() == len(shape) + assert tensor.layout() == layout + assert tensor.device() == device + assert tensor.dtype() == dtype + assert tensor.data_ptr() != 0 From 07126e7253508e826e524ddf2be197d080d0d357 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 13:31:54 -0400 Subject: [PATCH 19/20] Update year --- tests/roccv/python/test_py_stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/roccv/python/test_py_stream.py b/tests/roccv/python/test_py_stream.py index 8b35b16d..1560e4f5 100644 --- a/tests/roccv/python/test_py_stream.py +++ b/tests/roccv/python/test_py_stream.py @@ -1,5 +1,5 @@ # ############################################################################## -# Copyright (c) - 2025 Advanced Micro Devices, Inc. +# Copyright (c) - 2026 Advanced Micro Devices, Inc. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From f8c60eb06afd34796fe9377d5126f1f62b1db330 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 14:09:07 -0400 Subject: [PATCH 20/20] Add dtype/layout tests for PyTensor --- tests/roccv/python/test_py_tensor.py | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/roccv/python/test_py_tensor.py b/tests/roccv/python/test_py_tensor.py index 684f5d45..d394de8b 100644 --- a/tests/roccv/python/test_py_tensor.py +++ b/tests/roccv/python/test_py_tensor.py @@ -21,6 +21,7 @@ # # ############################################################################## +import numpy as np import pytest import rocpycv @@ -44,3 +45,41 @@ def test_tensor_basic_properties(shape, layout, dtype, device): assert tensor.device() == device assert tensor.dtype() == dtype assert tensor.data_ptr() != 0 + + +@pytest.mark.parametrize( + "dtype_in, expected_dtype", + [ + (rocpycv.eDataType.U8, rocpycv.eDataType.U8), + (rocpycv.eDataType.F32, rocpycv.eDataType.F32), + (np.uint8, rocpycv.eDataType.U8), + (np.float32, rocpycv.eDataType.F32), + (np.int32, rocpycv.eDataType.S32), + (np.dtype("uint16"), rocpycv.eDataType.U16), + ], +) +@pytest.mark.parametrize( + "layout_in, expected_layout, shape", + [ + (rocpycv.eTensorLayout.NHWC, rocpycv.eTensorLayout.NHWC, [2, 32, 64, 3]), + ("NHWC", rocpycv.eTensorLayout.NHWC, [2, 32, 64, 3]), + ("NCHW", rocpycv.eTensorLayout.NCHW, [1, 3, 16, 16]), + ("HWC", rocpycv.eTensorLayout.HWC, [8, 8, 4]), + ], +) +def test_tensor_construction_from_numpy_and_strings(dtype_in, expected_dtype, layout_in, expected_layout, shape): + tensor = rocpycv.Tensor(shape, dtype_in, layout_in, rocpycv.eDeviceType.GPU) + + assert tensor.shape() == shape + assert tensor.dtype() == expected_dtype + assert tensor.layout() == expected_layout + + +def 
test_tensor_construction_invalid_dtype_raises(): + with pytest.raises(Exception): + rocpycv.Tensor([1, 8, 8, 3], "not_a_dtype", rocpycv.eTensorLayout.NHWC, rocpycv.eDeviceType.GPU) + + +def test_tensor_construction_invalid_layout_raises(): + with pytest.raises(Exception): + rocpycv.Tensor([1, 8, 8, 3], rocpycv.eDataType.U8, "ZYXW", rocpycv.eDeviceType.GPU)