diff --git a/python/include/py_helpers.hpp b/python/include/py_helpers.hpp index 369cac10..2e3dc117 100644 --- a/python/include/py_helpers.hpp +++ b/python/include/py_helpers.hpp @@ -86,4 +86,23 @@ extern double2 GetDouble2FromTuple(py::tuple src); * @param src A python tuple of size 2. * @return int2 */ -extern int2 GetInt2FromTuple(py::tuple src); \ No newline at end of file +extern int2 GetInt2FromTuple(py::tuple src); + +/** + * @brief Resolves a Python object to an eTensorLayout. Accepts either an rocpycv.eTensorLayout enum + * value, or a layout string such as "NHWC". Throws std::runtime_error for unsupported inputs. + * + * @param obj A Python object describing the tensor layout. + * @return eTensorLayout + */ +extern eTensorLayout LayoutFromPyObject(py::object obj); + +/** + * @brief Resolves a Python object to an eDataType. Accepts either an rocpycv.eDataType enum value, + * or anything coercible to a NumPy dtype (e.g. ``np.float32``, ``np.dtype("uint8")``, + * ``"float32"``). Throws std::runtime_error for unsupported inputs. + * + * @param obj A Python object describing the tensor data type. + * @return eDataType + */ +extern eDataType DataTypeFromPyObject(py::object obj); \ No newline at end of file diff --git a/python/include/py_stream.hpp b/python/include/py_stream.hpp index 202420c9..a5923359 100644 --- a/python/include/py_stream.hpp +++ b/python/include/py_stream.hpp @@ -55,6 +55,18 @@ class PyStream { */ void synchronize(); + /** + * @brief Returns the wrapped HIP stream handle as an unsigned integer. + * + * Intended for zero-copy interop with frameworks that accept a raw HIP + * stream handle (e.g. ``migraphx.run_async`` with stream type + * ``"ihipStream_t"``). The handle is non-owning -- keep this PyStream alive + * for as long as the handle is in use. + * + * @return uintptr_t + */ + uintptr_t getHandle(); + /** * @brief Exports the PyStream object to the specified python module. * diff --git a/python/include/py_tensor.hpp b/python/include/py_tensor.hpp index 523a5f27..180006d2 100644 --- a/python/include/py_tensor.hpp +++ b/python/include/py_tensor.hpp @@ -38,11 +38,11 @@ class PyTensor : public std::enable_shared_from_this<PyTensor> { * wrap. * * @param shape The shape of the tensor. - * @param layout The layout of the tensor. * @param dtype The data type of the tensor. + * @param layout The layout of the tensor. * @param device The device of the tensor. */ - PyTensor(std::vector<int64_t> shape, eTensorLayout layout, eDataType dtype, eDeviceType device); + PyTensor(std::vector<int64_t> shape, eDataType dtype, eTensorLayout layout, eDeviceType device); /** * @brief Wraps an existing roccv::Tensor inside of a newly constructed PyTensor. @@ -138,6 +138,22 @@ class PyTensor : public std::enable_shared_from_this<PyTensor> { */ eDeviceType getDevice(); + /** + * @brief Returns the address of the tensor's underlying data buffer as an + * unsigned integer. For GPU tensors this is a HIP device address; for CPU + * tensors it is a host address. Use ``device()`` to disambiguate. + * + * The pointer is non-owning. The caller is responsible for ensuring this + * PyTensor remains alive for as long as the pointer is used; otherwise the + * underlying buffer may be freed and the pointer left dangling. + * + * Intended for zero-copy interop with frameworks that accept a raw + * pointer + shape + dtype (e.g. ``migraphx.argument_from_pointer``). + * + * @return uintptr_t + */ + uintptr_t getDataPtr(); + /** * @brief Gets the underlying roccv::Tensor that this tensor container wraps. 
* diff --git a/python/src/main.cpp b/python/src/main.cpp index ddbec687..e34c6fba 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -52,9 +52,60 @@ PYBIND11_MODULE(rocpycv, m) { m.doc() = R"pbdoc( - Python API reference - ----------------------- - This is the Python API reference for rocCV. + rocpycv — AMD GPU-accelerated image pre/post-processing + ======================================================= + + rocpycv is the Python binding for rocCV, a HIP/ROCm image processing + library. It exposes a NumPy-friendly :class:`Tensor` and a suite of + operators (resize, normalize, color conversion, geometric warps, ...) + that run on either GPU (default) or CPU. + + Quick start + ----------- + .. code-block:: python + + import numpy as np + import rocpycv + + # Wrap a NumPy array as a CPU Tensor (zero-copy via DLPack), then + # copy it to the GPU (explicit H2D transfer). + host = np.zeros((1, 480, 640, 3), np.uint8) + src = rocpycv.from_dlpack(host, "NHWC").copy_to(rocpycv.GPU) + + # Functional form: operators allocate and return a new Tensor. + resized = rocpycv.resize(src, (1, 224, 224, 3), rocpycv.LINEAR) + chw = rocpycv.reformat(resized, "NCHW") + + # ``*_into`` form: write into a caller-allocated output, optionally + # on a stream — useful in hot preprocessing loops. + stream = rocpycv.Stream() + out = rocpycv.Tensor((1, 224, 224, 3), np.uint8, "NHWC") + rocpycv.resize_into(out, src, rocpycv.LINEAR, stream) + stream.synchronize() + + Tensors + ------- + :class:`Tensor` arguments accept either rocpycv enums or familiar + Python types: + + * ``dtype`` — ``rocpycv.F32`` or any NumPy dtype/scalar (``np.float32``). + * ``layout`` — ``rocpycv.NHWC`` or a layout string (``"NHWC"``). + + For zero-copy interop, tensors implement the DLPack protocol — pass any + ``__dlpack__``-supporting object (NumPy array, PyTorch tensor, ...) to + :func:`from_dlpack`, and use :meth:`Tensor.data_ptr` to hand a raw GPU + pointer to inference frameworks such as MIGraphX. + + Operators + --------- + Most operators come in two forms: + + * ``op(src, ...)`` — allocates and returns a new :class:`Tensor`. + * ``op_into(dst, src, ...)`` — writes into a pre-allocated output, + avoiding per-call allocation in tight loops. + + All operators accept an optional ``stream`` (a :class:`Stream` wrapping + a ``hipStream_t``) and a ``device`` argument (defaults to GPU). )pbdoc"; PyException::Export(m); PyEnums::Export(m); diff --git a/python/src/operators/py_op_convert_to.cpp b/python/src/operators/py_op_convert_to.cpp index 9100ce3f..ad3a32a5 100644 --- a/python/src/operators/py_op_convert_to.cpp +++ b/python/src/operators/py_op_convert_to.cpp @@ -24,6 +24,8 @@ THE SOFTWARE. #include +#include "py_helpers.hpp" + PyTensor PyOpConvertTo::Execute(PyTensor& input, eDataType dtype, double alpha, double beta, std::optional<std::reference_wrapper<PyStream>> stream, eDeviceType device) { hipStream_t hipStream = stream.has_value() ? 
stream.value().get().getStream() : nullptr; @@ -44,17 +46,23 @@ void PyOpConvertTo::ExecuteInto(PyTensor& output, PyTensor& input, double alpha, void PyOpConvertTo::Export(py::module& m) { using namespace py::literals; - m.def("convert_to", &PyOpConvertTo::Execute, "src"_a, "dtype"_a, "alpha"_a = 1.0, "beta"_a = 0.0, - "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc( - + m.def("convert_to", + [](PyTensor& input, py::object dtype, double alpha, double beta, + std::optional<std::reference_wrapper<PyStream>> stream, eDeviceType device) { + return PyOpConvertTo::Execute(input, DataTypeFromPyObject(dtype), alpha, beta, stream, device); + }, + "src"_a, "dtype"_a, "alpha"_a = 1.0, "beta"_a = 0.0, "stream"_a = nullptr, + "device"_a = eDeviceType::GPU, R"pbdoc( + Executes the Convert To operation on the given HIP stream. See also: Refer to the rocCV C++ API reference for more information on this operation. - + Args: src (rocpycv.Tensor): Input tensor containing one or more images. - dtype (eDataType): Datatype of the output tensor. + dtype: Datatype of the output tensor. Either an ``rocpycv.eDataType`` + (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). alpha (double, optional): Scalar for output data. Defaults to 1.0. beta (double, optional): Offset for the data. Defaults to 0.0. stream (rocpycv.Stream, optional): HIP stream to run this operation on. diff --git a/python/src/operators/py_op_reformat.cpp b/python/src/operators/py_op_reformat.cpp index 6cfa4105..1351ca53 100644 --- a/python/src/operators/py_op_reformat.cpp +++ b/python/src/operators/py_op_reformat.cpp @@ -22,6 +22,8 @@ THE SOFTWARE. #include "operators/py_op_reformat.hpp" +#include "py_helpers.hpp" + void PyOpReformat::ExecuteInto(PyTensor& output, PyTensor& input, std::optional<std::reference_wrapper<PyStream>> stream, eDeviceType device) { hipStream_t hipStream = stream.has_value() ? stream.value().get().getStream() : nullptr; @@ -46,8 +48,12 @@ PyTensor PyOpReformat::Execute(PyTensor& input, eTensorLayout outLayout, void PyOpReformat::Export(py::module& m) { using namespace py::literals; - m.def("reformat", &PyOpReformat::Execute, "input"_a, "out_layout"_a, "stream"_a = nullptr, - "device"_a = eDeviceType::GPU, R"pbdoc( + m.def("reformat", + [](PyTensor& input, py::object outLayout, + std::optional<std::reference_wrapper<PyStream>> stream, eDeviceType device) { + return PyOpReformat::Execute(input, LayoutFromPyObject(outLayout), stream, device); + }, + "input"_a, "out_layout"_a, "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc( Executes the Reformat operation and returns the result as a new tensor. See also: @@ -55,7 +61,8 @@ void PyOpReformat::Export(py::module& m) { Args: input (rocpycv.Tensor): Input tensor to reformat. - out_layout (rocpycv.eTensorLayout): The layout to reformat the input tensor to. + out_layout: The layout to reformat the input tensor to. Either an + ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NCHW``) or a layout string (``"NCHW"``). stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. 
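Taken together, the two bindings above let Python callers skip the rocpycv enums entirely. A minimal usage sketch of the API as bound here (assuming a built `rocpycv` module and a visible GPU; not part of the diff):

```python
import numpy as np
import rocpycv

# dtype accepts a NumPy scalar type; out_layout accepts a plain string.
src = rocpycv.Tensor((1, 224, 224, 3), np.uint8, "NHWC")
f32 = rocpycv.convert_to(src, np.float32, alpha=1.0, beta=0.0)
chw = rocpycv.reformat(f32, "NCHW")
```
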
diff --git a/python/src/py_enums.cpp b/python/src/py_enums.cpp index 0ce39d08..6059c280 100644 --- a/python/src/py_enums.cpp +++ b/python/src/py_enums.cpp @@ -48,7 +48,6 @@ void PyEnums::Export(py::module& m) { .value("S32", DATA_TYPE_S32) .value("F32", DATA_TYPE_F32) .value("F64", DATA_TYPE_F64) - .value("4S16", DATA_TYPE_4S16) .export_values(); py::enum_<eDeviceType>(m, "eDeviceType") diff --git a/python/src/py_helpers.cpp b/python/src/py_helpers.cpp index 97902915..15ea92fc 100644 --- a/python/src/py_helpers.cpp +++ b/python/src/py_helpers.cpp @@ -22,19 +22,23 @@ THE SOFTWARE. #include "py_helpers.hpp" +#include + +#include #include +#include eDataType DLTypeToRoccvType(DLDataType dtype) { + if (dtype.lanes != 1) { + throw std::runtime_error("Datatype is not supported."); + } + if (dtype.bits == 8) { if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U8; if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S8; } else if (dtype.bits == 16) { - if (dtype.lanes == 4) { - return eDataType::DATA_TYPE_4S16; - } else if (dtype.lanes == 1) { - if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U16; - if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S16; - } + if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U16; + if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S16; } else if (dtype.bits == 32) { if (dtype.code == kDLFloat) return eDataType::DATA_TYPE_F32; if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U32; @@ -147,4 +151,54 @@ int2 GetInt2FromTuple(py::tuple src) { std::runtime_error("Cannot convert py::tuple to int2. py::tuple.size() != 2."); } return make_int2(src[0].cast<int>(), src[1].cast<int>()); +} + +eTensorLayout LayoutFromPyObject(py::object obj) { + if (py::isinstance<eTensorLayout>(obj)) { + return obj.cast<eTensorLayout>(); + } + + if (py::isinstance<py::str>(obj)) { + std::string s = obj.cast<std::string>(); + for (const auto& [layout, name] : roccv::TensorLayout::layoutStringTable) { + if (name == s) return layout; + } + throw std::runtime_error("Unknown tensor layout string: '" + s + "'."); + } + + throw std::runtime_error("layout must be an rocpycv.eTensorLayout or a layout string (e.g. 'NHWC')."); +} + +eDataType DataTypeFromPyObject(py::object obj) { + if (py::isinstance<eDataType>(obj)) { + return obj.cast<eDataType>(); + } + + // np.dtype() accepts numpy scalar types (np.float32), dtype instances, and dtype strings, + // so we delegate the parsing to NumPy itself rather than enumerating cases here. + py::dtype dt; + try { + static const py::object np_dtype = py::module_::import("numpy").attr("dtype"); + dt = np_dtype(obj).cast<py::dtype>(); + } catch (const std::exception&) { + throw std::runtime_error("dtype must be an rocpycv.eDataType or a NumPy dtype/scalar type (e.g. 
np.float32)."); + } + + DLDataTypeCode code; + switch (dt.kind()) { + case 'u': + code = kDLUInt; + break; + case 'i': + code = kDLInt; + break; + case 'f': + code = kDLFloat; + break; + default: + throw std::runtime_error("Unsupported NumPy dtype for rocpycv.Tensor (kind '" + std::string(1, dt.kind()) + + "')."); + } + DLDataType dl{static_cast<uint8_t>(code), static_cast<uint8_t>(dt.itemsize() * 8), 1}; + return DLTypeToRoccvType(dl); } \ No newline at end of file diff --git a/python/src/py_stream.cpp b/python/src/py_stream.cpp index ac6ae73f..c830bc8e 100644 --- a/python/src/py_stream.cpp +++ b/python/src/py_stream.cpp @@ -38,8 +38,15 @@ PyStream::~PyStream() { void PyStream::synchronize() { HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(m_stream)); } +uintptr_t PyStream::getHandle() { return reinterpret_cast<uintptr_t>(m_stream); } + void PyStream::Export(py::module& m) { py::class_<PyStream>(m, "Stream", "Python wrapper for HIP streams.") .def(py::init<>(), "Creates a HIP stream.") - .def("synchronize", &PyStream::synchronize, "Blocks until all worked queued on this stream is finished."); + .def("synchronize", &PyStream::synchronize, "Blocks until all work queued on this stream has finished.") + .def("handle", &PyStream::getHandle, + "Returns the underlying HIP stream handle (hipStream_t) as an integer. " + "Intended for zero-copy interop with frameworks that accept a raw stream handle, " + "e.g. migraphx.run_async(..., stream_handle, \"ihipStream_t\"). " + "The handle is non-owning -- keep the Stream alive while the handle is in use."); } \ No newline at end of file diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 913a25a4..5c757c02 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -59,7 +59,7 @@ DLManagedTensor* createDLManagedTensor(std::shared_ptr<PyTensor> tensor, st return dlTensor; } -PyTensor::PyTensor(std::vector<int64_t> shape, eTensorLayout layout, eDataType dtype, eDeviceType device) { +PyTensor::PyTensor(std::vector<int64_t> shape, eDataType dtype, eTensorLayout layout, eDeviceType device) { roccv::TensorShape tShape(roccv::TensorShape(roccv::TensorLayout(layout), shape)); m_tensor = std::make_shared<roccv::Tensor>(tShape, roccv::DataType(dtype), device); } /** * @brief Wraps an existing roccv::Tensor inside of a newly constructed PyTensor. @@ -178,6 +178,11 @@ eTensorLayout PyTensor::getLayout() { return m_tensor->layout().elayout(); } eDeviceType PyTensor::getDevice() { return m_tensor->device(); } +uintptr_t PyTensor::getDataPtr() { + auto tensorData = m_tensor->exportData(); + return reinterpret_cast<uintptr_t>(tensorData.basePtr()); +} + std::shared_ptr<roccv::Tensor> PyTensor::getTensor() { return m_tensor; } py::tuple PyTensor::getDLDevice() { @@ -197,8 +202,15 @@ void PyTensor::Export(pybind11::module& m) { pybind11::class_<PyTensor, std::shared_ptr<PyTensor>> tensor(m, "Tensor"); tensor - .def(pybind11::init<std::vector<int64_t>, eTensorLayout, eDataType, eDeviceType>(), "shape"_a, "layout"_a, - "dtype"_a, "device"_a = eDeviceType::GPU, "Constructs a tensor object.") + .def(pybind11::init([](std::vector<int64_t> shape, py::object dtype, py::object layout, eDeviceType device) { + return std::make_shared<PyTensor>(shape, DataTypeFromPyObject(dtype), LayoutFromPyObject(layout), + device); + }), + "shape"_a, "dtype"_a, "layout"_a, "device"_a = eDeviceType::GPU, + "Constructs a tensor object. ``dtype`` may be an ``rocpycv.eDataType`` (e.g. " + "``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). ``layout`` " + "may be an ``rocpycv.eTensorLayout`` (e.g. 
``rocpycv.NHWC``) or a layout string " + "(``\"NHWC\"``).") .def("copy_to", &PyTensor::copyTo, "device"_a, "Returns a deep copy of the tensor with data copied to a specified device type.") .def("__dlpack__", &PyTensor::toDLPack, "stream"_a = py::none(), @@ -207,12 +219,27 @@ void PyTensor::Export(pybind11::module& m) { .def("shape", &PyTensor::getShape, "Returns a list representing the tensor shape.") .def("layout", &PyTensor::getLayout, "Returns the layout for this tensor.") .def("device", &PyTensor::getDevice, "Returns the device this tensor is on.") + .def("data_ptr", &PyTensor::getDataPtr, + "Returns the address of the tensor's underlying buffer as an integer. " + "For GPU tensors this is a HIP device address; for CPU tensors a host address. " + "The pointer is non-owning -- keep the tensor alive for as long as the pointer is used. " + "Intended for zero-copy interop with frameworks like MIGraphX.") .def("ndim", &PyTensor::getRank, "Returns the number of dimensions of the tensor.") .def("dtype", &PyTensor::getDataType, "Returns the data type of the tensor.") .def("__dlpack_device__", &PyTensor::getDLDevice, "Returns a tuple containing the DLPack device and device id for the tensor.") - .def("reshape", &PyTensor::reshape, "new_shape"_a, "layout"_a, - "Creates a new tensor with the specified shape."); - m.def("from_dlpack", &PyTensor::fromDLPack, "buffer"_a, "layout"_a, - "Wraps a DLPack supported tensor in a rocpycv tensor."); + .def( + "reshape", + [](PyTensor& self, std::vector<int64_t> newShape, py::object layout) { + return self.reshape(newShape, LayoutFromPyObject(layout)); + }, + "new_shape"_a, "layout"_a, + "Creates a new tensor with the specified shape. ``layout`` may be an " + "``rocpycv.eTensorLayout`` or a layout string (e.g. ``\"NHWC\"``)."); + m.def( + "from_dlpack", + [](pybind11::object src, py::object layout) { return PyTensor::fromDLPack(src, LayoutFromPyObject(layout)); }, + "buffer"_a, "layout"_a, + "Wraps a DLPack supported tensor in a rocpycv tensor. ``layout`` may be an " + "``rocpycv.eTensorLayout`` or a layout string (e.g. ``\"NHWC\"``)."); } \ No newline at end of file diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index 855a8d5a..270fb848 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -1,8 +1,59 @@ """ - Python API reference - ----------------------- - This is the Python API reference for rocCV. + rocpycv — AMD GPU-accelerated image pre/post-processing + ======================================================= + + rocpycv is the Python binding for rocCV, a HIP/ROCm image processing + library. It exposes a NumPy-friendly :class:`Tensor` and a suite of + operators (resize, normalize, color conversion, geometric warps, ...) + that run on either GPU (default) or CPU. + + Quick start + ----------- + .. code-block:: python + + import numpy as np + import rocpycv + + # Wrap a NumPy array as a CPU Tensor (zero-copy via DLPack), then + # copy it to the GPU (explicit H2D transfer). + host = np.zeros((1, 480, 640, 3), np.uint8) + src = rocpycv.from_dlpack(host, "NHWC").copy_to(rocpycv.GPU) + + # Functional form: operators allocate and return a new Tensor. + resized = rocpycv.resize(src, (1, 224, 224, 3), rocpycv.LINEAR) + chw = rocpycv.reformat(resized, "NCHW") + + # ``*_into`` form: write into a caller-allocated output, optionally + # on a stream — useful in hot preprocessing loops. 
+ stream = rocpycv.Stream() + out = rocpycv.Tensor((1, 224, 224, 3), np.uint8, "NHWC") + rocpycv.resize_into(out, src, rocpycv.LINEAR, stream) + stream.synchronize() + + Tensors + ------- + :class:`Tensor` arguments accept either rocpycv enums or familiar + Python types: + + * ``dtype`` — ``rocpycv.F32`` or any NumPy dtype/scalar (``np.float32``). + * ``layout`` — ``rocpycv.NHWC`` or a layout string (``"NHWC"``). + + For zero-copy interop, tensors implement the DLPack protocol — pass any + ``__dlpack__``-supporting object (NumPy array, PyTorch tensor, ...) to + :func:`from_dlpack`, and use :meth:`Tensor.data_ptr` to hand a raw GPU + pointer to inference frameworks such as MIGraphX. + + Operators + --------- + Most operators come in two forms: + + * ``op(src, ...)`` — allocates and returns a new :class:`Tensor`. + * ``op_into(dst, src, ...)`` — writes into a pre-allocated output, + avoiding per-call allocation in tight loops. + + All operators accept an optional ``stream`` (a :class:`Stream` wrapping + a ``hipStream_t``) and a ``device`` argument (defaults to GPU). """ from __future__ import annotations @@ -153,6 +204,10 @@ class Stream: """ Creates a HIP stream. """ + def handle(self) -> int: + """ + Returns the underlying HIP stream handle (hipStream_t) as an integer. Intended for zero-copy interop with frameworks that accept a raw stream handle, e.g. migraphx.run_async(..., stream_handle, "ihipStream_t"). The handle is non-owning -- keep the Stream alive while the handle is in use. + """ def synchronize(self) -> None: """ Blocks until all worked queued on this stream is finished. @@ -166,14 +221,18 @@ class Tensor: """ Returns a tuple containing the DLPack device and device id for the tensor. """ - def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: eTensorLayout, dtype: eDataType, device: eDeviceType = ...) -> None: + def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], dtype: typing.Any, layout: typing.Any, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ - Constructs a tensor object. + Constructs a tensor object. ``dtype`` may be an ``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NHWC``) or a layout string (``"NHWC"``). """ def copy_to(self, device: eDeviceType) -> Tensor: """ Returns a deep copy of the tensor with data copied to a specified device type. """ + def data_ptr(self) -> int: + """ + Returns the address of the tensor's underlying buffer as an integer. For GPU tensors this is a HIP device address; for CPU tensors a host address. The pointer is non-owning -- keep the tensor alive for as long as the pointer is used. Intended for zero-copy interop with frameworks like MIGraphX. + """ def device(self) -> eDeviceType: """ Returns the device this tensor is on. @@ -190,9 +249,9 @@ class Tensor: """ Returns the number of dimensions of the tensor. """ - def reshape(self, new_shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: eTensorLayout) -> Tensor: + def reshape(self, new_shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: typing.Any) -> Tensor: """ - Creates a new tensor with the specified shape. + Creates a new tensor with the specified shape. ``layout`` may be an ``rocpycv.eTensorLayout`` or a layout string (e.g. ``"NHWC"``). 
""" def shape(self) -> list[int]: """ @@ -472,8 +531,6 @@ class eDataType: F32 F64 - - 4S16 """ F32: typing.ClassVar[eDataType] # value = F64: typing.ClassVar[eDataType] # value = @@ -483,7 +540,7 @@ class eDataType: U16: typing.ClassVar[eDataType] # value = U32: typing.ClassVar[eDataType] # value = U8: typing.ClassVar[eDataType] # value = - __members__: typing.ClassVar[dict[str, eDataType]] # value = {'U8': , 'S8': , 'U16': , 'S16': , 'U32': , 'S32': , 'F32': , 'F64': , '4S16': } + __members__: typing.ClassVar[dict[str, eDataType]] # value = {'U8': , 'S8': , 'U16': , 'S16': , 'U32': , 'S32': , 'F32': , 'F64': } def __eq__(self, other: typing.Any) -> bool: ... def __getstate__(self) -> int: @@ -728,7 +785,7 @@ class eThresholdType: @property def value(self) -> int: ... -def advcvtcolor(src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def advcvtcolor(src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Advanced Color Convert operation on the given HIP stream. @@ -745,7 +802,7 @@ def advcvtcolor(src: Tensor, conversion_code: eColorConversionCode, color_spec: Returns: rocpycv.Tensor: The output tensor. """ -def advcvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def advcvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Advanced Color Convert operation on the given HIP stream. @@ -763,7 +820,7 @@ def advcvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversion Returns: None """ -def bilateral_filter(src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def bilateral_filter(src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Bilateral Filter operation on the given HIP stream. @@ -783,7 +840,7 @@ def bilateral_filter(src: Tensor, diameter: typing.SupportsInt | typing.Supports Returns: rocpycv.Tensor: The output tensor. """ -def bilateral_filter_into(dst: Tensor, src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> None: +def bilateral_filter_into(dst: Tensor, src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Bilateral Filter operation on the given HIP stream. @@ -804,7 +861,7 @@ def bilateral_filter_into(dst: Tensor, src: Tensor, diameter: typing.SupportsInt Returns: None """ -def bndbox(src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def bndbox(src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the BndBox operation on the given HIP stream. @@ -820,7 +877,7 @@ def bndbox(src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = Non Returns: rocpycv.Tensor: The output tensor. """ -def bndbox_into(dst: Tensor, src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def bndbox_into(dst: Tensor, src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the BndBox operation on the given HIP stream. @@ -837,7 +894,7 @@ def bndbox_into(dst: Tensor, src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.S Returns: None """ -def center_crop(src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def center_crop(src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Center Crop operation on the given HIP stream. @@ -854,7 +911,7 @@ def center_crop(src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = N Returns: rocpycv.Tensor: The output tensor. """ -def center_crop_into(dst: Tensor, src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def center_crop_into(dst: Tensor, src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Center Crop operation on the given HIP stream. @@ -871,7 +928,7 @@ def center_crop_into(dst: Tensor, src: Tensor, crop_size: tuple, stream: rocpycv Returns: None """ -def composite(foreground: Tensor, background: Tensor, fgmask: Tensor, outchannels: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def composite(foreground: Tensor, background: Tensor, fgmask: Tensor, outchannels: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Composite operation on the given HIP stream. @@ -889,7 +946,7 @@ def composite(foreground: Tensor, background: Tensor, fgmask: Tensor, outchannel Returns: rocpycv.Tensor: The output tensor with number of channels. """ -def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> None: +def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Composite operation on the given HIP stream. @@ -907,16 +964,17 @@ def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Returns: None """ -def convert_to(src: Tensor, dtype: eDataType, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def convert_to(src: Tensor, dtype: typing.Any, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Convert To operation on the given HIP stream. See also: Refer to the rocCV C++ API reference for more information on this operation. - + Args: src (rocpycv.Tensor): Input tensor containing one or more images. - dtype (eDataType): Datatype of the output tensor. + dtype: Datatype of the output tensor. Either an ``rocpycv.eDataType`` + (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). alpha (double, optional): Scalar for output data. Defaults to 1.0. beta (double, optional): Offset for the data. Defaults to 0.0. stream (rocpycv.Stream, optional): HIP stream to run this operation on. @@ -925,7 +983,7 @@ def convert_to(src: Tensor, dtype: eDataType, alpha: typing.SupportsFloat | typi Returns: rocpycv.Tensor: The output tensor. """ -def convert_to_into(dst: Tensor, src: Tensor, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def convert_to_into(dst: Tensor, src: Tensor, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Convert To operation on the given HIP stream. @@ -943,7 +1001,7 @@ def convert_to_into(dst: Tensor, src: Tensor, alpha: typing.SupportsFloat | typi Returns: None """ -def copymakeborder(src: Tensor, border_mode: eBorderType = ..., border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, bottom: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, right: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def copymakeborder(src: Tensor, border_mode: eBorderType = eBorderType.eBorderType.CONSTANT, border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, bottom: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, right: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the CopyMakeBorder operation on the given HIP stream. @@ -964,7 +1022,7 @@ def copymakeborder(src: Tensor, border_mode: eBorderType = ..., border_value: li Returns: rocpycv.Tensor: The output tensor. 
""" -def copymakeborder_into(dst: Tensor, src: Tensor, border_mode: eBorderType = ..., border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def copymakeborder_into(dst: Tensor, src: Tensor, border_mode: eBorderType = eBorderType.eBorderType.CONSTANT, border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the CopyMakeBorder operation on the given HIP stream. @@ -984,7 +1042,7 @@ def copymakeborder_into(dst: Tensor, src: Tensor, border_mode: eBorderType = ... Returns: None """ -def custom_crop(src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def custom_crop(src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Custom Crop operation on the given HIP stream. @@ -1001,7 +1059,7 @@ def custom_crop(src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = Non Returns: None """ -def custom_crop_into(dst: Tensor, src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def custom_crop_into(dst: Tensor, src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Custom Crop operation on the given HIP stream. @@ -1017,7 +1075,7 @@ def custom_crop_into(dst: Tensor, src: Tensor, crop_rect: Box, stream: rocpycv.S Returns: rocpycv.Tensor: The output tensor. """ -def cvtcolor(src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def cvtcolor(src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Color Convert operation on the given HIP stream. @@ -1033,7 +1091,7 @@ def cvtcolor(src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv Returns: rocpycv.Tensor: The output tensor. """ -def cvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def cvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Color Convert operation on the given HIP stream. @@ -1050,7 +1108,7 @@ def cvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCod Returns: None """ -def flip(src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def flip(src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Flip operation on the given HIP stream. @@ -1066,7 +1124,7 @@ def flip(src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stre Returns: rocpycv.Tensor: The output tensor. 
""" -def flip_into(dst: Tensor, src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def flip_into(dst: Tensor, src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Flip operation on the given HIP stream. @@ -1083,11 +1141,11 @@ def flip_into(dst: Tensor, src: Tensor, flip_code: typing.SupportsInt | typing.S Returns: None """ -def from_dlpack(buffer: typing.Any, layout: eTensorLayout) -> Tensor: +def from_dlpack(buffer: typing.Any, layout: typing.Any) -> Tensor: """ - Wraps a DLPack supported tensor in a rocpycv tensor. + Wraps a DLPack supported tensor in a rocpycv tensor. ``layout`` may be an ``rocpycv.eTensorLayout`` or a layout string (e.g. ``"NHWC"``). """ -def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Gamma Contrast operation on the given HIP stream. @@ -1103,7 +1161,7 @@ def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsInd Returns: rocpycv.Tensor: The output tensor. """ -def gamma_contrast_into(dst: Tensor, src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def gamma_contrast_into(dst: Tensor, src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Gamma Contrast operation on the given HIP stream. @@ -1120,7 +1178,7 @@ def gamma_contrast_into(dst: Tensor, src: Tensor, gamma: typing.SupportsFloat | Returns: None """ -def histogram(src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def histogram(src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Histogram operation on the given HIP stream. @@ -1136,7 +1194,7 @@ def histogram(src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | Returns: rocpycv.Tensor: Output tensor with width of 256 and a height equal to the batch size of input (1 if HWC input). """ -def histogram_into(dst: Tensor, src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def histogram_into(dst: Tensor, src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Histogram operation on the given HIP stream. @@ -1153,7 +1211,7 @@ def histogram_into(dst: Tensor, src: Tensor, mask: rocpycv.Tensor | None, stream Returns: None """ -def nms(src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> Tensor: +def nms(src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Non-maximum Suppression operation on the given HIP stream. @@ -1171,7 +1229,7 @@ def nms(src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typ Returns: rocpycv.Tensor: The output tensor of shape [i, j], containing 1 (kept) or 0 (suppressed) for each bounding box (j) per batch (i). Results will be written to this tensor. """ -def nms_into(dst: Tensor, src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def nms_into(dst: Tensor, src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Non-maximum Suppression operation on the given HIP stream. @@ -1190,7 +1248,7 @@ def nms_into(dst: Tensor, src: Tensor, scores: Tensor, score_threshold: typing.S Returns: None """ -def normalize(src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def normalize(src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Normalize operation on the given HIP stream. @@ -1211,7 +1269,7 @@ def normalize(src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsIn Returns: rocpycv.Tensor: The output tensor. """ -def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Normalize operation on the given HIP stream. 
@@ -1233,7 +1291,7 @@ def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: Returns: None """ -def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def reformat(input: Tensor, out_layout: typing.Any, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Reformat operation and returns the result as a new tensor. @@ -1242,14 +1300,15 @@ def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | Args: input (rocpycv.Tensor): Input tensor to reformat. - out_layout (rocpycv.eTensorLayout): The layout to reformat the input tensor to. + out_layout: The layout to reformat the input tensor to. Either an + ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NCHW``) or a layout string (``"NCHW"``). stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. Returns: rocpycv.Tensor: The reformatted tensor. """ -def reformat_into(output: Tensor, input: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def reformat_into(output: Tensor, input: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Reformat operation on the given HIP stream. @@ -1265,7 +1324,7 @@ def reformat_into(output: Tensor, input: Tensor, stream: rocpycv.Stream | None = Returns: None """ -def remap(src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def remap(src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Remap operation on the given HIP stream. @@ -1287,7 +1346,7 @@ def remap(src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_in Returns: rocpycv.Tensor: The output tensor. """ -def remap_into(dst: Tensor, src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def remap_into(dst: Tensor, src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Remap operation on the given HIP stream. @@ -1310,7 +1369,7 @@ def remap_into(dst: Tensor, src: Tensor, map: Tensor, in_interpolation: eInterpo Returns: None """ -def resize(src: Tensor, shape: tuple, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def resize(src: Tensor, shape: tuple, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Resize operation on the given HIP stream. 
@@ -1327,7 +1386,7 @@ def resize(src: Tensor, shape: tuple, interp: eInterpolationType, stream: rocpyc Returns: rocpycv.Tensor: The output tensor. """ -def resize_into(dst: Tensor, src: Tensor, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def resize_into(dst: Tensor, src: Tensor, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Resize operation on the given HIP stream. @@ -1344,7 +1403,7 @@ def resize_into(dst: Tensor, src: Tensor, interp: eInterpolationType, stream: ro Returns: None """ -def rotate(src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def rotate(src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Rotate operation on the given HIP stream. @@ -1362,7 +1421,7 @@ def rotate(src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, Returns: rocpycv.Tensor: The output tensor. """ -def rotate_into(dst: Tensor, src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def rotate_into(dst: Tensor, src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Rotate operation on the given HIP stream. @@ -1381,7 +1440,7 @@ def rotate_into(dst: Tensor, src: Tensor, angle_deg: typing.SupportsFloat | typi Returns: None """ -def threshold(src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def threshold(src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Thresholding operation on the given HIP stream. @@ -1397,7 +1456,7 @@ def threshold(src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing. stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. """ -def threshold_into(dst: Tensor, src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def threshold_into(dst: Tensor, src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Thresholding operation on the given HIP stream. @@ -1414,7 +1473,7 @@ def threshold_into(dst: Tensor, src: Tensor, thresh: Tensor, maxVal: Tensor, max stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. 
Defaults to GPU. """ -def warp_affine(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def warp_affine(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Warp Affine operation on the given HIP stream. @@ -1434,7 +1493,7 @@ def warp_affine(src: Tensor, xform: list, inverted: bool, interp: eInterpolation Returns: rocpycv.Tensor: The output tensor. """ -def warp_affine_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def warp_affine_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Warp Affine operation on the given HIP stream. @@ -1455,7 +1514,7 @@ def warp_affine_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, inte Returns: None """ -def warp_perspective(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def warp_perspective(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Warp Perspective operation on the given HIP stream. @@ -1475,7 +1534,7 @@ def warp_perspective(src: Tensor, xform: list, inverted: bool, interp: eInterpol Returns: rocpycv.Tensor: The output tensor. """ -def warp_perspective_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def warp_perspective_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Warp Perspective operation on the given HIP stream. diff --git a/samples/migraphx-classification/README.md b/samples/migraphx-classification/README.md new file mode 100644 index 00000000..83c7f4e2 --- /dev/null +++ b/samples/migraphx-classification/README.md @@ -0,0 +1,49 @@ +# rocCV MIGraphX Classification Sample + +This sample demonstrates how to use rocCV to preprocess an image on the GPU and run inference with a ResNet50 ONNX model through MIGraphX. The preprocessed tensor is handed off to MIGraphX via a raw GPU pointer for zero-copy interop, so no host round-trip is needed between preprocessing and inference. + +## Dependencies + +- A rocCV build with the Python bindings (`rocpycv`) on `PYTHONPATH`. +- [MIGraphX](https://github.com/ROCm/AMDMIGraphX) with its Python bindings. +- `opencv-python` and `numpy`. +- A ResNet50 ONNX model with input name `data` and shape `[N, 3, 224, 224]` (e.g. the ONNX Model Zoo `resnet50-v1-7.onnx`). +- Optional: a newline-separated ImageNet class labels file for human-readable output. 
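+ +As a quick sanity check that the interop pieces are in place before running the sample (an illustrative snippet, not part of the sample itself): + +```python +import migraphx # confirms the MIGraphX Python bindings load +import numpy as np +import rocpycv + +# Allocate a small GPU tensor and exercise the raw-pointer/stream interop API. +t = rocpycv.Tensor((1, 8, 8, 3), np.uint8, "NHWC") +stream = rocpycv.Stream() +print(hex(t.data_ptr()), hex(stream.handle())) +```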
+ +## Command line + +```shell +python3 migraphx_classification.py \ + --input path/to/image.jpg \ + --model path/to/resnet50.onnx \ + --labels path/to/imagenet_classes.txt \ + --top-k 5 +``` + +Arguments: +- `--input` (required): path to the input image. +- `--model` (required): path to the ResNet50 ONNX file. +- `--labels` (optional): path to an ImageNet class label file. If omitted, classes are reported by index. +- `--top-k` (optional, default 5): number of top predictions to print. + +On the first run, the script compiles the ONNX model for the GPU (with FP16 quantization enabled by default) and caches the result alongside the ONNX file as `<model>_b1_fp16.mxr` (`_b1_fp32.mxr` when FP16 is disabled). Subsequent runs load the cached `.mxr` directly and skip compilation. + +## Preprocessing Operators + +The preprocessing pipeline runs entirely on the GPU through `rocpycv`: + +1. **Resize**: Resizes to 224x224 using cubic interpolation. +2. **CvtColor**: Converts the OpenCV BGR image to RGB. +3. **Convert To**: Casts U8 pixels to float32 (no scaling — the `/255` step is folded into the normalize parameters). +4. **Normalize**: Applies ImageNet mean/std normalization. The mean and std constants are pre-multiplied by 255 so the operator can normalize directly from the [0, 255] float pixel range in a single pass. +5. **Reformat**: Converts the tensor from NHWC to NCHW, the layout MIGraphX/ONNX expects. + +## MIGraphX Interop + +The compiled MIGraphX program is built with `offload_copy=False`, so input and output buffers must already live on the GPU. The sample binds: +- The rocCV preprocessed tensor's GPU pointer (`preprocessed.data_ptr()`) as the `data` input via `migraphx.argument_from_pointer`. +- A `migraphx.allocate_gpu` buffer as the output. + +Inference is launched with `model.run_async` using the same HIP stream as the preprocessing pipeline (`stream.handle()`), so preprocessing and inference are serialized on a single stream with no extra synchronization until the final `stream.synchronize()`. + +The output logits are copied back to the host with `migraphx.from_gpu`, passed through softmax, and the top-K classes are printed. diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py new file mode 100644 index 00000000..be7fc66e --- /dev/null +++ b/samples/migraphx-classification/migraphx_classification.py @@ -0,0 +1,179 @@ +# ############################################################################## +# Copyright (c) - 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# ############################################################################## + +"""Classification with rocCV preprocessing and MIGraphX inference.""" + +from __future__ import annotations + +import argparse +import os + +import cv2 +import migraphx +import numpy as np +import rocpycv + + +# ImageNet normalization, scaled to the [0, 255] pixel range so we can apply +# them directly to U8-derived float pixels without a separate /255 step: +# (pixel/255 - mean) / std == (pixel - mean*255) / (std*255) +IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32) * 255.0 +IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32) * 255.0 + +INPUT_H, INPUT_W = 224, 224 +BATCH_SIZE = 1 + + +def read_image(image_path: str) -> np.ndarray: + """Read an image from disk as an NHWC uint8 BGR numpy array.""" + bgr = cv2.imread(image_path) + if bgr is None: + raise FileNotFoundError(f"Unable to load image: {image_path}") + return np.stack([bgr]) + + +def load_or_compile_model(onnx_path: str, use_fp16: bool = True) -> migraphx.program: + """Load a cached compiled model, or parse + compile + cache the ONNX file.""" + precision_tag = "fp16" if use_fp16 else "fp32" + cache_path = f"{os.path.splitext(onnx_path)[0]}_b{BATCH_SIZE}_{precision_tag}.mxr" + + if os.path.exists(cache_path): + print(f"Loading cached compiled model: {cache_path}") + return migraphx.load(cache_path, format="msgpack") + + print(f"Parsing ONNX: {onnx_path}") + model = migraphx.parse_onnx( + onnx_path, + map_input_dims={"data": [BATCH_SIZE, 3, INPUT_H, INPUT_W]}, + ) + + if use_fp16: + print("Quantizing to FP16...") + # Inserts internal float -> half conversions; model inputs/outputs stay + # float32, so the existing F32 buffer setup remains unchanged. + migraphx.quantize_fp16(model) + + print("Compiling for GPU...") + # offload_copy=False lets us bind GPU buffers directly, enabling + # zero-copy interop. 
+    model.compile(migraphx.get_target("gpu"), offload_copy=False)
+
+    print(f"Caching compiled model to: {cache_path}")
+    migraphx.save(model, cache_path, format="msgpack")
+    return model
+
+
+def load_labels(labels_path: str | None) -> list[str] | None:
+    if labels_path is None:
+        return None
+    with open(labels_path) as f:
+        return [line.strip() for line in f if line.strip()]
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Classification with rocCV preprocessing and MIGraphX inference"
+    )
+    parser.add_argument("--input", required=True, help="Path to input image")
+    parser.add_argument(
+        "--model", required=True, help="Path to a ResNet50 ONNX model"
+    )
+    parser.add_argument(
+        "--labels",
+        default=None,
+        help="Optional path to a newline-separated ImageNet class labels file",
+    )
+    parser.add_argument("--top-k", type=int, default=5)
+    return parser.parse_args()
+
+
+def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray:
+    # Subtract the max before exponentiating for numerical stability.
+    x = x - np.max(x, axis=axis, keepdims=True)
+    e = np.exp(x)
+    return e / np.sum(e, axis=axis, keepdims=True)
+
+
+def main() -> None:
+    args = parse_args()
+
+    model = load_or_compile_model(args.model)
+
+    print(f"Reading image: {args.input}")
+    np_image = read_image(args.input)
+    print(f"Input image shape: {np_image.shape}")
+
+    # Load/allocate tensors on the GPU
+    input_tensor: rocpycv.Tensor = rocpycv.from_dlpack(np_image, "NHWC").copy_to(rocpycv.GPU)
+    resized: rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.uint8, "NHWC")
+    rgb: rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.uint8, "NHWC")
+    f32: rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.float32, "NHWC")
+    normalized: rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.float32, "NHWC")
+    preprocessed: rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), np.float32, "NCHW")
+
+    mean_t: rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), "NHWC").copy_to(rocpycv.GPU)
+    std_t: rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), "NHWC").copy_to(rocpycv.GPU)
+
+    # Set up MIGraphX shapes and arguments
+    in_shape: migraphx.shape = migraphx.shape(type="float_type", lens=preprocessed.shape())
+    out_shape: migraphx.shape = migraphx.shape(type="float_type", lens=[BATCH_SIZE, 1000])
+
+    in_arg: migraphx.argument = migraphx.argument_from_pointer(in_shape, preprocessed.data_ptr())
+    # allocate_gpu returns a migraphx.argument backed by GPU memory.
+    out_buf: migraphx.argument = migraphx.allocate_gpu(out_shape)
+
+    # Begin preprocessing
+    print("Preprocessing with rocCV...")
+    stream: rocpycv.Stream = rocpycv.Stream()
+
+    rocpycv.resize_into(resized, input_tensor, rocpycv.CUBIC, stream)
+    rocpycv.cvtcolor_into(rgb, resized, rocpycv.COLOR_BGR2RGB, stream)
+    rocpycv.convert_to_into(f32, rgb, 1.0, 0.0, stream)
+    rocpycv.normalize_into(normalized, f32, mean_t, std_t, rocpycv.NormalizeFlags.SCALE_IS_STDDEV, 1.0, 0.0, 0.0, stream)
+    rocpycv.reformat_into(preprocessed, normalized, stream)
+
+    print(f"Preprocessed tensor shape (NCHW): {preprocessed.shape()}")
+
+    print("Running MIGraphX inference...")
+
+    outputs = model.run_async(
+        {"data": in_arg, "main:#output_0": out_buf},
+        stream.handle(),
+        "ihipStream_t",
+    )
+    stream.synchronize()
+
+    # Postprocess the inference results
+    logits = np.array(migraphx.from_gpu(outputs[0]))
+    probs = softmax(logits, axis=1)
+
+    labels = load_labels(args.labels)
+
+    # Report top-K for the first image in the batch.
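+    # np.argsort sorts ascending, so reverse with [::-1] to get the
+    # highest-probability class indices first, then keep the top k.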
+ print(f"\nTop {args.top_k} predictions:") + top = np.argsort(probs[0])[::-1][: args.top_k] + for rank, idx in enumerate(top, start=1): + name = labels[idx] if labels is not None and idx < len(labels) else f"class {idx}" + print(f" {rank}. {name}: {probs[0][idx]:.6f}") + + +if __name__ == "__main__": + main() diff --git a/samples/classification/README.md b/samples/pytorch-classification/README.md similarity index 100% rename from samples/classification/README.md rename to samples/pytorch-classification/README.md diff --git a/samples/classification/pytorch_classification.py b/samples/pytorch-classification/pytorch_classification.py similarity index 100% rename from samples/classification/pytorch_classification.py rename to samples/pytorch-classification/pytorch_classification.py diff --git a/tests/roccv/python/test_op_adv_cvt_color.py b/tests/roccv/python/test_op_adv_cvt_color.py index 8a243f68..32b7f686 100644 --- a/tests/roccv/python/test_op_adv_cvt_color.py +++ b/tests/roccv/python/test_op_adv_cvt_color.py @@ -62,7 +62,7 @@ @pytest.mark.parametrize("samples,width,height", [[1, 64, 48], [2, 128, 72]]) def test_op_advcvtcolor_interleaved444(samples, height, width, code, spec, dtype, device): input_tensor = generate_tensor(samples, width, height, 3, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, 3], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, 3], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.advcvtcolor(input_tensor, code, spec, stream, device) @@ -79,7 +79,7 @@ def test_op_advcvtcolor_interleaved444(samples, height, width, code, spec, dtype @pytest.mark.parametrize("samples,width,height", [[1, 64, 48], [2, 128, 72]]) def test_op_advcvtcolor_interleaved_to_semiplanar(samples, height, width, code, spec, dtype, device): input_tensor = generate_tensor(samples, width, height, 3, dtype, device) - output_golden = rocpycv.Tensor([samples, (height * 3) // 2, width, 1], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, (height * 3) // 2, width, 1], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.advcvtcolor(input_tensor, code, spec, stream, device) @@ -96,7 +96,7 @@ def test_op_advcvtcolor_interleaved_to_semiplanar(samples, height, width, code, @pytest.mark.parametrize("samples,width,height", [[1, 64, 48], [2, 128, 72]]) def test_op_advcvtcolor_semiplanar_to_interleaved(samples, height, width, code, spec, dtype, device): input_tensor = generate_tensor(samples, width, (height * 3) // 2, 1, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, 3], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, 3], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.advcvtcolor(input_tensor, code, spec, stream, device) diff --git a/tests/roccv/python/test_op_bilateral_filter.py b/tests/roccv/python/test_op_bilateral_filter.py index 6e0abfd3..72e6708d 100644 --- a/tests/roccv/python/test_op_bilateral_filter.py +++ b/tests/roccv/python/test_op_bilateral_filter.py @@ -44,7 +44,7 @@ ]) def test_op_bilateral_filter(samples, height, width, channels, border_mode, border_val, diameter, sigma_color, sigma_space, dtype, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = 
rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.bilateral_filter_into(output_golden, input, diameter, sigma_color, diff --git a/tests/roccv/python/test_op_bnd_box.py b/tests/roccv/python/test_op_bnd_box.py index d6fd8045..1b203a5a 100644 --- a/tests/roccv/python/test_op_bnd_box.py +++ b/tests/roccv/python/test_op_bnd_box.py @@ -49,11 +49,10 @@ def generate_boxes(samples: int, height: int, width: int) -> rocpycv.BndBoxes: (3, 150, 50), (7, 15, 23) ]) -def test_op_remap(samples, height, width, channels, device): +def test_op_bndbox(samples, height, width, channels, device): input = generate_tensor(samples, width, height, channels, rocpycv.eDataType.U8, device) boxes = generate_boxes(samples, height, width) - output_golden = rocpycv.Tensor([samples, height, width, channels], - rocpycv.eTensorLayout.NHWC, rocpycv.eDataType.U8, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eDataType.U8, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.bndbox(input, boxes, stream, device) diff --git a/tests/roccv/python/test_op_center_crop.py b/tests/roccv/python/test_op_center_crop.py index a8054dcc..8daa507f 100644 --- a/tests/roccv/python/test_op_center_crop.py +++ b/tests/roccv/python/test_op_center_crop.py @@ -41,8 +41,7 @@ ]) def test_op_center_crop(samples, height, width, channels, dtype, box, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, box[1], box[0], channels], - rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, box[1], box[0], channels], dtype, rocpycv.eTensorLayout.NHWC, device) if device == rocpycv.eDeviceType.GPU: stream = rocpycv.Stream() diff --git a/tests/roccv/python/test_op_composite.py b/tests/roccv/python/test_op_composite.py index e5d37538..12f43668 100644 --- a/tests/roccv/python/test_op_composite.py +++ b/tests/roccv/python/test_op_composite.py @@ -39,7 +39,7 @@ def test_op_composite(samples, height, width, out_channels, dtype, device): foreground = generate_tensor(samples, width, height, 3, dtype, device) background = generate_tensor(samples, width, height, 3, dtype, device) mask = generate_tensor(samples, width, height, 1, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, out_channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, out_channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.composite(foreground, background, mask, out_channels, stream, device) diff --git a/tests/roccv/python/test_op_convert_to.py b/tests/roccv/python/test_op_convert_to.py index 9e9f389f..c7555531 100644 --- a/tests/roccv/python/test_op_convert_to.py +++ b/tests/roccv/python/test_op_convert_to.py @@ -40,7 +40,7 @@ ]) def test_op_convert_to(samples, height, width, channels, device, dtype, out_dtype, alpha, beta): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, out_dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], out_dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.convert_to_into(output_golden, input, alpha, beta, stream, device) diff --git a/tests/roccv/python/test_op_copy_make_border.py b/tests/roccv/python/test_op_copy_make_border.py index 
27994e00..fcafe36f 100644 --- a/tests/roccv/python/test_op_copy_make_border.py +++ b/tests/roccv/python/test_op_copy_make_border.py @@ -46,7 +46,7 @@ def test_op_copy_make_border(samples, height, width, channels, top, right, bottom, left, border_mode, border_value, dtype, device): input = generate_tensor(samples, width, height, channels, dtype, device) output_golden = rocpycv.Tensor([samples, height + top + bottom, width + right + left, - channels], rocpycv.eTensorLayout.NHWC, dtype, device) + channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.copymakeborder(input, border_mode, border_value, top, bottom, left, right, stream, device) diff --git a/tests/roccv/python/test_op_custom_crop.py b/tests/roccv/python/test_op_custom_crop.py index 026130c5..47c3f1f1 100644 --- a/tests/roccv/python/test_op_custom_crop.py +++ b/tests/roccv/python/test_op_custom_crop.py @@ -41,8 +41,7 @@ ]) def test_op_custom_crop(samples, height, width, channels, dtype, box, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, box.height, box.width, channels], - rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, box.height, box.width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) if device == rocpycv.eDeviceType.GPU: stream = rocpycv.Stream() rocpycv.custom_crop_into(output_golden, input, box, stream, device) diff --git a/tests/roccv/python/test_op_cvt_color.py b/tests/roccv/python/test_op_cvt_color.py index 448a598b..637fbbfe 100644 --- a/tests/roccv/python/test_op_cvt_color.py +++ b/tests/roccv/python/test_op_cvt_color.py @@ -51,7 +51,7 @@ def test_op_cvtcolor(samples, height, width, code, dtype, device): out_channels = 1 input = generate_tensor(samples, width, height, in_channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, out_channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, out_channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.cvtcolor(input, code, stream, device) diff --git a/tests/roccv/python/test_op_flip.py b/tests/roccv/python/test_op_flip.py index 32ed6215..c4980cd6 100644 --- a/tests/roccv/python/test_op_flip.py +++ b/tests/roccv/python/test_op_flip.py @@ -40,7 +40,7 @@ def test_op_flip(samples, width, height, channels, dtype, flip_code, device): input_tensor = generate_tensor(samples, width, height, channels, dtype, device) stream = rocpycv.Stream() - output_tensor_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_tensor_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) rocpycv.flip_into(output_tensor_golden, input_tensor, flip_code, stream, device) output_tensor = rocpycv.flip(input_tensor, flip_code, stream, device) stream.synchronize() diff --git a/tests/roccv/python/test_op_gamma_contrast.py b/tests/roccv/python/test_op_gamma_contrast.py index 68343ec9..38893594 100644 --- a/tests/roccv/python/test_op_gamma_contrast.py +++ b/tests/roccv/python/test_op_gamma_contrast.py @@ -38,7 +38,7 @@ ]) def test_op_gamma_contrast(samples, height, width, channels, gamma, dtype, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, 
height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.gamma_contrast(input, gamma, stream, device) diff --git a/tests/roccv/python/test_op_histogram.py b/tests/roccv/python/test_op_histogram.py index e980a057..17c741d2 100644 --- a/tests/roccv/python/test_op_histogram.py +++ b/tests/roccv/python/test_op_histogram.py @@ -37,7 +37,7 @@ ]) def test_op_histogram(samples, height, width, in_dtype, out_dtype, device): input = generate_tensor(samples, width, height, 1, in_dtype, device) - output_golden = rocpycv.Tensor([samples, 256, 1], rocpycv.eTensorLayout.HWC, out_dtype, device) + output_golden = rocpycv.Tensor([samples, 256, 1], out_dtype, rocpycv.eTensorLayout.HWC, device) stream = rocpycv.Stream() output = rocpycv.histogram(input, None, stream, device) diff --git a/tests/roccv/python/test_op_non_max_suppression.py b/tests/roccv/python/test_op_non_max_suppression.py index 163a0a8f..e39c1e44 100644 --- a/tests/roccv/python/test_op_non_max_suppression.py +++ b/tests/roccv/python/test_op_non_max_suppression.py @@ -44,7 +44,7 @@ def generate_boxes(samples: int, num_boxes: int, device: rocpycv.eDeviceType) -> def test_op_non_max_suppression(samples, num_boxes, device): boxes = generate_boxes(samples, num_boxes, device) scores = generate_tensor_generic([samples, num_boxes], rocpycv.eTensorLayout.NW, rocpycv.eDataType.F32, device) - output_golden = rocpycv.Tensor([samples, num_boxes], rocpycv.eTensorLayout.NW, rocpycv.eDataType.U8, device) + output_golden = rocpycv.Tensor([samples, num_boxes], rocpycv.eDataType.U8, rocpycv.eTensorLayout.NW, device) stream = rocpycv.Stream() # Hardcoding the score and IoU threshold here. The only thing we care about is the resulting size of the diff --git a/tests/roccv/python/test_op_normalize.py b/tests/roccv/python/test_op_normalize.py index e25bc300..72036515 100644 --- a/tests/roccv/python/test_op_normalize.py +++ b/tests/roccv/python/test_op_normalize.py @@ -39,7 +39,7 @@ def test_op_normalize(samples, height, width, channels, device, dtype): input = generate_tensor(samples, width, height, channels, dtype, device) base = generate_tensor(1, 1, 1, channels, rocpycv.eDataType.F32, device) scale = generate_tensor(1, 1, 1, channels, rocpycv.eDataType.F32, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.normalize_into(output_golden, input, base, scale, None, 1.0, 0.0, 0.0, stream, device) diff --git a/tests/roccv/python/test_op_reformat.py b/tests/roccv/python/test_op_reformat.py index 62715ff9..10d2b7a6 100644 --- a/tests/roccv/python/test_op_reformat.py +++ b/tests/roccv/python/test_op_reformat.py @@ -54,7 +54,7 @@ def test_op_reformat(samples, height, width, channels, inLayout, outLayout, devi input_shape = create_tensor_shape(inLayout, samples, channels, height, width) output_shape = create_tensor_shape(outLayout, samples, channels, height, width) input_tensor = generate_tensor_generic(input_shape, inLayout, dtype, device) - output_golden = rocpycv.Tensor(output_shape, outLayout, dtype, device) + output_golden = rocpycv.Tensor(output_shape, dtype, outLayout, device) stream = rocpycv.Stream() rocpycv.reformat_into(input_tensor, output_golden, stream, device) diff --git a/tests/roccv/python/test_op_remap.py b/tests/roccv/python/test_op_remap.py index 25321f9e..e9ccc394 100644 --- 
a/tests/roccv/python/test_op_remap.py +++ b/tests/roccv/python/test_op_remap.py @@ -44,7 +44,7 @@ def test_op_remap(samples, width, height, channels, dtype, map_interp, interp, map_type, align_corners, border_mode, border_val, device): input_tensor = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) if (map_type == rocpycv.REMAP_ABSOLUTE): diff --git a/tests/roccv/python/test_op_resize.py b/tests/roccv/python/test_op_resize.py index 721c1d7f..70e0c897 100644 --- a/tests/roccv/python/test_op_resize.py +++ b/tests/roccv/python/test_op_resize.py @@ -38,7 +38,7 @@ def test_op_resize(out_shape, in_shape, samples, channels, interp, dtype, device # Input/Output shapes are passed in as format [width, height] input = generate_tensor(samples, in_shape[0], in_shape[1], channels, dtype, device) output_shape = (samples, out_shape[1], out_shape[0], channels) - output_golden = rocpycv.Tensor(output_shape, rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor(output_shape, dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.resize_into(output_golden, input, interp, stream, device) diff --git a/tests/roccv/python/test_op_rotate.py b/tests/roccv/python/test_op_rotate.py index 13c2fceb..1fa1e8e8 100644 --- a/tests/roccv/python/test_op_rotate.py +++ b/tests/roccv/python/test_op_rotate.py @@ -46,7 +46,7 @@ def calc_center_shift(center_x, center_y, angle) -> tuple[float, float]: ]) def test_op_rotate(samples, width, height, channels, angle, dtype, interp, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) center_x = (width - 1) / 2 center_y = (height - 1) / 2 diff --git a/tests/roccv/python/test_op_thresholding.py b/tests/roccv/python/test_op_thresholding.py index 1eae8d97..8613a266 100644 --- a/tests/roccv/python/test_op_thresholding.py +++ b/tests/roccv/python/test_op_thresholding.py @@ -41,7 +41,7 @@ def test_op_thresholding(samples, height, width, channels, dtype, thresh, mvdata, threshType, device): input_tensor = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) thresh_array = np.full(samples, thresh, np.float64) maxval_array = np.full(samples, mvdata, np.float64) diff --git a/tests/roccv/python/test_op_warp_affine.py b/tests/roccv/python/test_op_warp_affine.py index b4fd0dfb..9b9656b0 100644 --- a/tests/roccv/python/test_op_warp_affine.py +++ b/tests/roccv/python/test_op_warp_affine.py @@ -41,7 +41,7 @@ ]) def test_op_warp_affine(samples, width, height, channels, dtype, mat, inverted, interp, border_mode, border_val, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = 
rocpycv.Stream() output = rocpycv.warp_affine(input, mat, inverted, interp, border_mode, border_val, stream, device) diff --git a/tests/roccv/python/test_op_warp_perspective.py b/tests/roccv/python/test_op_warp_perspective.py index c72754a6..3164a951 100644 --- a/tests/roccv/python/test_op_warp_perspective.py +++ b/tests/roccv/python/test_op_warp_perspective.py @@ -41,7 +41,7 @@ ]) def test_op_warp_perspective(samples, width, height, channels, dtype, mat, inverted, interp, border_mode, border_val, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) print(output_golden.shape()) stream = rocpycv.Stream() diff --git a/tests/roccv/python/test_py_stream.py b/tests/roccv/python/test_py_stream.py new file mode 100644 index 00000000..1560e4f5 --- /dev/null +++ b/tests/roccv/python/test_py_stream.py @@ -0,0 +1,32 @@ +# ############################################################################## +# Copyright (c) - 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# ############################################################################## + +import rocpycv + + +def test_stream_handle_returns_valid_pointer(): + stream = rocpycv.Stream() + handle = stream.handle() + + assert isinstance(handle, int) + assert handle != 0 diff --git a/tests/roccv/python/test_py_tensor.py b/tests/roccv/python/test_py_tensor.py new file mode 100644 index 00000000..d394de8b --- /dev/null +++ b/tests/roccv/python/test_py_tensor.py @@ -0,0 +1,85 @@ +# ############################################################################## +# Copyright (c) - 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# ############################################################################## + +import numpy as np +import pytest +import rocpycv + + +@pytest.mark.parametrize("device", [rocpycv.eDeviceType.GPU, rocpycv.eDeviceType.CPU]) +@pytest.mark.parametrize("dtype", [rocpycv.eDataType.U8, rocpycv.eDataType.F32, rocpycv.eDataType.S32]) +@pytest.mark.parametrize( + "shape, layout", + [ + ([2, 32, 64, 3], rocpycv.eTensorLayout.NHWC), + ([1, 3, 16, 16], rocpycv.eTensorLayout.NCHW), + ([8, 8, 4], rocpycv.eTensorLayout.HWC), + ], +) +def test_tensor_basic_properties(shape, layout, dtype, device): + tensor = rocpycv.Tensor(shape, dtype, layout, device) + + assert tensor.shape() == shape + assert tensor.ndim() == len(shape) + assert tensor.layout() == layout + assert tensor.device() == device + assert tensor.dtype() == dtype + assert tensor.data_ptr() != 0 + + +@pytest.mark.parametrize( + "dtype_in, expected_dtype", + [ + (rocpycv.eDataType.U8, rocpycv.eDataType.U8), + (rocpycv.eDataType.F32, rocpycv.eDataType.F32), + (np.uint8, rocpycv.eDataType.U8), + (np.float32, rocpycv.eDataType.F32), + (np.int32, rocpycv.eDataType.S32), + (np.dtype("uint16"), rocpycv.eDataType.U16), + ], +) +@pytest.mark.parametrize( + "layout_in, expected_layout, shape", + [ + (rocpycv.eTensorLayout.NHWC, rocpycv.eTensorLayout.NHWC, [2, 32, 64, 3]), + ("NHWC", rocpycv.eTensorLayout.NHWC, [2, 32, 64, 3]), + ("NCHW", rocpycv.eTensorLayout.NCHW, [1, 3, 16, 16]), + ("HWC", rocpycv.eTensorLayout.HWC, [8, 8, 4]), + ], +) +def test_tensor_construction_from_numpy_and_strings(dtype_in, expected_dtype, layout_in, expected_layout, shape): + tensor = rocpycv.Tensor(shape, dtype_in, layout_in, rocpycv.eDeviceType.GPU) + + assert tensor.shape() == shape + assert tensor.dtype() == expected_dtype + assert tensor.layout() == expected_layout + + +def test_tensor_construction_invalid_dtype_raises(): + with pytest.raises(Exception): + rocpycv.Tensor([1, 8, 8, 3], "not_a_dtype", rocpycv.eTensorLayout.NHWC, rocpycv.eDeviceType.GPU) + + +def test_tensor_construction_invalid_layout_raises(): + with pytest.raises(Exception): + rocpycv.Tensor([1, 8, 8, 3], rocpycv.eDataType.U8, "ZYXW", rocpycv.eDeviceType.GPU)
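+
+
+def test_tensor_from_dlpack_wrap():
+    # Illustrative extra check, assuming the documented zero-copy DLPack wrap:
+    # from_dlpack on a NumPy array should yield a CPU tensor that shares the
+    # host buffer and preserves shape, dtype, and layout.
+    host = np.arange(24, dtype=np.uint8).reshape(1, 2, 4, 3)
+    tensor = rocpycv.from_dlpack(host, "NHWC")
+
+    assert tensor.shape() == [1, 2, 4, 3]
+    assert tensor.dtype() == rocpycv.eDataType.U8
+    assert tensor.layout() == rocpycv.eTensorLayout.NHWC
+    assert tensor.device() == rocpycv.eDeviceType.CPU
+    # Zero-copy: the tensor should point at the NumPy array's own buffer.
+    assert tensor.data_ptr() == host.ctypes.data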