Changes from all commits (21 commits)
fc788bc
Expose tensor pointer in rocpycv
zacharyvincze Apr 28, 2026
134eb7c
Expose stream pointer in rocpycv
zacharyvincze Apr 28, 2026
228f422
Update stubs
zacharyvincze Apr 29, 2026
12c85cf
Add migraphx classification example
zacharyvincze Apr 29, 2026
8fb6c4f
Remove 4S16 from pyenums
zacharyvincze Apr 29, 2026
e024eef
Move classification sample location
zacharyvincze Apr 29, 2026
25e2b9d
Move pytorch classification sample location
zacharyvincze Apr 29, 2026
c8da0a2
Cleanup migraphx classification sample
zacharyvincze Apr 29, 2026
666226f
Minor cleanup
zacharyvincze Apr 29, 2026
d0ee899
Minor variable name changes
zacharyvincze Apr 29, 2026
8ad82ba
Allow use of numpy types to specify rocpycv.Tensor types
zacharyvincze Apr 29, 2026
6a0e0fa
Swap dtype <-> layout position in tensor construction
zacharyvincze Apr 29, 2026
ea1caff
Add numpy/string layout options for convert_to and reformat python op…
zacharyvincze Apr 29, 2026
b4c39d5
Improve initial rocpycv module documentation
zacharyvincze Apr 29, 2026
edc8dac
Rename test_op_remap -> test_op_bndbox
zacharyvincze Apr 30, 2026
e52f8dc
Remove support for 4S16 in DLTypeToRoccvType
zacharyvincze Apr 30, 2026
ac74517
Add test for PyStream handle()
zacharyvincze Apr 30, 2026
c150f79
Add basic PyTensor test
zacharyvincze Apr 30, 2026
07126e7
Update year
zacharyvincze Apr 30, 2026
f8c60eb
Add dtype/layout tests for PyTensor
zacharyvincze Apr 30, 2026
2499513
Merge branch 'develop' into zv/feature/migraphx-classification
zacharyvincze May 6, 2026
21 changes: 20 additions & 1 deletion python/include/py_helpers.hpp
@@ -86,4 +86,23 @@ extern double2 GetDouble2FromTuple(py::tuple src);
* @param src A python tuple of size 2.
* @return int2
*/
extern int2 GetInt2FromTuple(py::tuple src);

/**
* @brief Resolves a Python object to an eTensorLayout. Accepts either an rocpycv.eTensorLayout enum
* value, or a layout string such as "NHWC". Throws std::runtime_error for unsupported inputs.
*
* @param obj A Python object describing the tensor layout.
* @return eTensorLayout
*/
extern eTensorLayout LayoutFromPyObject(py::object obj);

/**
* @brief Resolves a Python object to an eDataType. Accepts either an rocpycv.eDataType enum value,
* or anything coercible to a NumPy dtype (e.g. ``np.float32``, ``np.dtype("uint8")``,
* ``"float32"``). Throws std::runtime_error for unsupported inputs.
*
* @param obj A Python object describing the tensor data type.
* @return eDataType
*/
extern eDataType DataTypeFromPyObject(py::object obj);
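
Taken together, these helpers make the following Python spellings interchangeable. A minimal
sketch, assuming the enum names shown in the module docstring (``rocpycv.U8``, ``rocpycv.NHWC``):

.. code-block:: python

    import numpy as np
    import rocpycv

    shape = (1, 224, 224, 3)

    # Three equivalent tensors: dtype as a rocpycv enum, a NumPy scalar
    # type, or a dtype string; layout as a rocpycv enum or a plain string.
    a = rocpycv.Tensor(shape, rocpycv.U8, rocpycv.NHWC)
    b = rocpycv.Tensor(shape, np.uint8, "NHWC")
    c = rocpycv.Tensor(shape, np.dtype("uint8"), "NHWC")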
12 changes: 12 additions & 0 deletions python/include/py_stream.hpp
@@ -55,6 +55,18 @@ class PyStream {
*/
void synchronize();

/**
* @brief Returns the wrapped HIP stream handle as an unsigned integer.
*
* Intended for zero-copy interop with frameworks that accept a raw HIP
* stream handle (e.g. ``migraphx.run_async`` with stream type
* ``"ihipStream_t"``). The handle is non-owning -- keep this PyStream alive
* for as long as the handle is in use.
*
* @return uintptr_t
*/
uintptr_t getHandle();

/**
* @brief Exports the PyStream object to the specified python module.
*
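A usage sketch for the new handle (the model path and the empty parameter map are placeholders,
and the exact ``run_async`` signature is an assumption to check against your MIGraphX version):

.. code-block:: python

    import migraphx
    import rocpycv

    stream = rocpycv.Stream()

    # Hypothetical model; parse and compile once up front.
    prog = migraphx.parse_onnx("resnet50.onnx")
    prog.compile(migraphx.get_target("gpu"))
    params = {}  # input bindings elided for brevity

    # handle() exposes the raw hipStream_t as an integer; MIGraphX is told
    # to interpret it as an "ihipStream_t" and enqueues inference on it.
    results = prog.run_async(params, stream.handle(), "ihipStream_t")
    stream.synchronize()  # keep `stream` alive while the handle is in use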
20 changes: 18 additions & 2 deletions python/include/py_tensor.hpp
@@ -38,11 +38,11 @@ class PyTensor : public std::enable_shared_from_this<PyTensor> {
* wrap.
*
* @param shape The shape of the tensor.
-   * @param layout The layout of the tensor.
* @param dtype The data type of the tensor.
+   * @param layout The layout of the tensor.
* @param device The device of the tensor.
*/
-   PyTensor(std::vector<int64_t> shape, eTensorLayout layout, eDataType dtype, eDeviceType device);
+   PyTensor(std::vector<int64_t> shape, eDataType dtype, eTensorLayout layout, eDeviceType device);

/**
* @brief Wraps an existing roccv::Tensor inside of a newly constructed PyTensor.
@@ -138,6 +138,22 @@ class PyTensor : public std::enable_shared_from_this<PyTensor> {
*/
eDeviceType getDevice();

/**
* @brief Returns the address of the tensor's underlying data buffer as an
* unsigned integer. For GPU tensors this is a HIP device address; for CPU
* tensors it is a host address. Use ``device()`` to disambiguate.
*
* The pointer is non-owning. The caller is responsible for ensuring this
* PyTensor remains alive for as long as the pointer is used; otherwise the
* underlying buffer may be freed and the pointer left dangling.
*
* Intended for zero-copy interop with frameworks that accept a raw
* pointer + shape + dtype (e.g. ``migraphx.argument_from_pointer``).
*
* @return uintptr_t
*/
uintptr_t getDataPtr();

/**
* @brief Gets the underlying roccv::Tensor that this tensor container wraps.
*
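A companion sketch for the tensor side, handing a GPU buffer to MIGraphX without a copy
(``migraphx.shape``/``argument_from_pointer`` usage is an assumption to verify against your
MIGraphX version; a packed layout is assumed):

.. code-block:: python

    import migraphx
    import rocpycv

    # GPU-resident float32 NCHW tensor (placeholder contents).
    chw = rocpycv.Tensor((1, 3, 224, 224), rocpycv.F32, "NCHW")

    # Wrap the raw device pointer as a MIGraphX argument; no data is copied.
    shape = migraphx.shape(type="float_type", lens=[1, 3, 224, 224])
    arg = migraphx.argument_from_pointer(shape, chw.data_ptr())
    # `chw` owns the buffer: keep it alive for as long as `arg` is in use.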
57 changes: 54 additions & 3 deletions python/src/main.cpp
@@ -52,9 +52,60 @@ THE SOFTWARE.

PYBIND11_MODULE(rocpycv, m) {
m.doc() = R"pbdoc(
-   Python API reference
-   -----------------------
-   This is the Python API reference for rocCV.
rocpycv — AMD GPU-accelerated image pre/post-processing
=======================================================

rocpycv is the Python binding for rocCV, a HIP/ROCm image processing
library. It exposes a NumPy-friendly :class:`Tensor` and a suite of
operators (resize, normalize, color conversion, geometric warps, ...)
that run on either GPU (default) or CPU.

Quick start
-----------
.. code-block:: python

import numpy as np
import rocpycv

# Wrap a NumPy array as a CPU Tensor (zero-copy via DLPack), then
# copy it to the GPU (explicit H2D transfer).
host = np.zeros((1, 480, 640, 3), np.uint8)
src = rocpycv.from_dlpack(host, "NHWC").copy_to(rocpycv.GPU)

# Functional form: operators allocate and return a new Tensor.
resized = rocpycv.resize(src, (1, 224, 224, 3), rocpycv.LINEAR)
chw = rocpycv.reformat(resized, "NCHW")

# ``*_into`` form: write into a caller-allocated output, optionally
# on a stream — useful in hot preprocessing loops.
stream = rocpycv.Stream()
out = rocpycv.Tensor((1, 224, 224, 3), np.uint8, "NHWC")
rocpycv.resize_into(out, src, rocpycv.LINEAR, stream)
stream.synchronize()

Tensors
-------
:class:`Tensor` arguments accept either rocpycv enums or familiar
Python types:

* ``dtype`` — ``rocpycv.F32`` or any NumPy dtype/scalar (``np.float32``).
* ``layout`` — ``rocpycv.NHWC`` or a layout string (``"NHWC"``).

For zero-copy interop, tensors implement the DLPack protocol — pass any
``__dlpack__``-supporting object (NumPy array, PyTorch tensor, ...) to
:func:`from_dlpack`, and use :meth:`Tensor.data_ptr` to hand a raw GPU
pointer to inference frameworks such as MIGraphX.

Operators
---------
Most operators come in two forms:

* ``op(src, ...)`` — allocates and returns a new :class:`Tensor`.
* ``op_into(dst, src, ...)`` — writes into a pre-allocated output,
avoiding per-call allocation in tight loops.

All operators accept an optional ``stream`` (a :class:`Stream` wrapping
a ``hipStream_t``) and a ``device`` argument (defaults to GPU).
)pbdoc";
PyException::Export(m);
PyEnums::Export(m);
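Condensed end-to-end, the docstring's pieces compose into the kind of preprocessing loop the new
MIGraphX classification sample performs. A sketch with a placeholder input image:

.. code-block:: python

    import numpy as np
    import rocpycv

    stream = rocpycv.Stream()

    # Placeholder for a decoded HWC uint8 image batch of shape (1, H, W, C).
    host = np.zeros((1, 480, 640, 3), np.uint8)
    src = rocpycv.from_dlpack(host, "NHWC").copy_to(rocpycv.GPU)

    # Resize -> scale to float32 [0, 1] -> NCHW, all enqueued on one stream.
    resized = rocpycv.Tensor((1, 224, 224, 3), np.uint8, "NHWC")
    rocpycv.resize_into(resized, src, rocpycv.LINEAR, stream)
    floats = rocpycv.convert_to(resized, np.float32, alpha=1.0 / 255.0, stream=stream)
    chw = rocpycv.reformat(floats, "NCHW", stream=stream)
    stream.synchronize()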
18 changes: 13 additions & 5 deletions python/src/operators/py_op_convert_to.cpp
@@ -24,6 +24,8 @@ THE SOFTWARE.

#include <op_convert_to.hpp>

#include "py_helpers.hpp"

PyTensor PyOpConvertTo::Execute(PyTensor& input, eDataType dtype, double alpha, double beta,
std::optional<std::reference_wrapper<PyStream>> stream, eDeviceType device) {
hipStream_t hipStream = stream.has_value() ? stream.value().get().getStream() : nullptr;
@@ -44,17 +46,23 @@ void PyOpConvertTo::ExecuteInto(PyTensor& output, PyTensor& input, double alpha,

void PyOpConvertTo::Export(py::module& m) {
using namespace py::literals;
m.def("convert_to", &PyOpConvertTo::Execute, "src"_a, "dtype"_a, "alpha"_a = 1.0, "beta"_a = 0.0,
"stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc(

m.def("convert_to",
[](PyTensor& input, py::object dtype, double alpha, double beta,
std::optional<std::reference_wrapper<PyStream>> stream, eDeviceType device) {
return PyOpConvertTo::Execute(input, DataTypeFromPyObject(dtype), alpha, beta, stream, device);
},
"src"_a, "dtype"_a, "alpha"_a = 1.0, "beta"_a = 0.0, "stream"_a = nullptr,
"device"_a = eDeviceType::GPU, R"pbdoc(

Executes the Convert To operation on the given HIP stream.

See also:
Refer to the rocCV C++ API reference for more information on this operation.

Args:
src (rocpycv.Tensor): Input tensor containing one or more images.
-   dtype (eDataType): Datatype of the output tensor.
dtype: Datatype of the output tensor. Either an ``rocpycv.eDataType``
(e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``).
alpha (double, optional): Scalar for output data. Defaults to 1.0.
beta (double, optional): Offset for the data. Defaults to 0.0.
stream (rocpycv.Stream, optional): HIP stream to run this operation on.
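A usage sketch of the two accepted dtype spellings (``src`` is a placeholder input tensor):

.. code-block:: python

    import numpy as np
    import rocpycv

    src = rocpycv.Tensor((1, 224, 224, 3), np.uint8, "NHWC")  # placeholder

    # out = src * alpha + beta, cast to the requested dtype; the NumPy and
    # enum spellings below resolve to the same operation.
    dst = rocpycv.convert_to(src, np.float32, alpha=1.0 / 255.0, beta=0.0)
    dst2 = rocpycv.convert_to(src, rocpycv.F32, alpha=1.0 / 255.0)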
13 changes: 10 additions & 3 deletions python/src/operators/py_op_reformat.cpp
@@ -22,6 +22,8 @@ THE SOFTWARE.

#include "operators/py_op_reformat.hpp"

#include "py_helpers.hpp"

void PyOpReformat::ExecuteInto(PyTensor& output, PyTensor& input,
std::optional<std::reference_wrapper<PyStream>> stream, eDeviceType device) {
hipStream_t hipStream = stream.has_value() ? stream.value().get().getStream() : nullptr;
@@ -46,16 +48,21 @@ PyTensor PyOpReformat::Execute(PyTensor& input, eTensorLayout outLayout,
void PyOpReformat::Export(py::module& m) {
using namespace py::literals;

m.def("reformat", &PyOpReformat::Execute, "input"_a, "out_layout"_a, "stream"_a = nullptr,
"device"_a = eDeviceType::GPU, R"pbdoc(
m.def("reformat",
[](PyTensor& input, py::object outLayout,
std::optional<std::reference_wrapper<PyStream>> stream, eDeviceType device) {
return PyOpReformat::Execute(input, LayoutFromPyObject(outLayout), stream, device);
},
"input"_a, "out_layout"_a, "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc(
Executes the Reformat operation and returns the result as a new tensor.

See also:
Refer to the rocCV C++ API reference for more information on this operation.

Args:
input (rocpycv.Tensor): Input tensor to reformat.
-   out_layout (rocpycv.eTensorLayout): The layout to reformat the input tensor to.
out_layout: The layout to reformat the input tensor to. Either an
``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NCHW``) or a layout string (``"NCHW"``).
stream (rocpycv.Stream, optional): HIP stream to run this operation on.
device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU.

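Likewise for reformat, a sketch showing that the string and enum layout spellings are
interchangeable:

.. code-block:: python

    import numpy as np
    import rocpycv

    nhwc = rocpycv.Tensor((1, 224, 224, 3), np.float32, "NHWC")

    # Interleaved -> planar reorder; both target-layout spellings work.
    nchw = rocpycv.reformat(nhwc, "NCHW")
    nchw_enum = rocpycv.reformat(nhwc, rocpycv.NCHW)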
1 change: 0 additions & 1 deletion python/src/py_enums.cpp
@@ -48,7 +48,6 @@ void PyEnums::Export(py::module& m) {
.value("S32", DATA_TYPE_S32)
.value("F32", DATA_TYPE_F32)
.value("F64", DATA_TYPE_F64)
.value("4S16", DATA_TYPE_4S16)
.export_values();

py::enum_<eDeviceType>(m, "eDeviceType")
66 changes: 60 additions & 6 deletions python/src/py_helpers.cpp
@@ -22,19 +22,23 @@ THE SOFTWARE.

#include "py_helpers.hpp"

#include <pybind11/numpy.h>

#include <core/tensor_layout.hpp>
#include <stdexcept>
#include <string>

eDataType DLTypeToRoccvType(DLDataType dtype) {
if (dtype.lanes != 1) {
throw std::runtime_error("Datatype is not supported.");
}

if (dtype.bits == 8) {
if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U8;
if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S8;
} else if (dtype.bits == 16) {
-   if (dtype.lanes == 4) {
-       return eDataType::DATA_TYPE_4S16;
-   } else if (dtype.lanes == 1) {
-       if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U16;
-       if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S16;
-   }
+   if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U16;
+   if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S16;
} else if (dtype.bits == 32) {
if (dtype.code == kDLFloat) return eDataType::DATA_TYPE_F32;
if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U32;
Expand Down Expand Up @@ -147,4 +151,54 @@ int2 GetInt2FromTuple(py::tuple src) {
std::runtime_error("Cannot convert py::tuple to int2. py::tuple.size() != 2.");
}
return make_int2(src[0].cast<int>(), src[1].cast<int>());
}

eTensorLayout LayoutFromPyObject(py::object obj) {
if (py::isinstance<eTensorLayout>(obj)) {
return obj.cast<eTensorLayout>();
}

if (py::isinstance<py::str>(obj)) {
std::string s = obj.cast<std::string>();
for (const auto& [layout, name] : roccv::TensorLayout::layoutStringTable) {
if (name == s) return layout;
}
throw std::runtime_error("Unknown tensor layout string: '" + s + "'.");
}

throw std::runtime_error("layout must be an rocpycv.eTensorLayout or a layout string (e.g. 'NHWC').");
}

eDataType DataTypeFromPyObject(py::object obj) {
if (py::isinstance<eDataType>(obj)) {
return obj.cast<eDataType>();
}

// np.dtype() accepts numpy scalar types (np.float32), dtype instances, and dtype strings,
// so we delegate the parsing to NumPy itself rather than enumerating cases here.
py::dtype dt;
try {
static const py::object np_dtype = py::module_::import("numpy").attr("dtype");
dt = np_dtype(obj).cast<py::dtype>();
} catch (const std::exception&) {
throw std::runtime_error("dtype must be an rocpycv.eDataType or a NumPy dtype/scalar type (e.g. np.float32).");
}

DLDataTypeCode code;
switch (dt.kind()) {
case 'u':
code = kDLUInt;
break;
case 'i':
code = kDLInt;
break;
case 'f':
code = kDLFloat;
break;
default:
throw std::runtime_error("Unsupported NumPy dtype for rocpycv.Tensor (kind '" + std::string(1, dt.kind()) +
"').");
}
DLDataType dl{static_cast<uint8_t>(code), static_cast<uint8_t>(dt.itemsize() * 8), 1};
return DLTypeToRoccvType(dl);
}
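
For intuition, the same resolution logic rendered in Python (illustrative only -- the
authoritative mapping is the C++ above; no ``resolve_dtype`` helper is actually exported):

.. code-block:: python

    import numpy as np

    def resolve_dtype(obj):
        # Delegate parsing to NumPy, exactly as the C++ helper does.
        dt = np.dtype(obj)  # accepts np.float32, np.dtype("uint8"), "float32", ...
        kind_to_dl = {"u": "kDLUInt", "i": "kDLInt", "f": "kDLFloat"}
        if dt.kind not in kind_to_dl:
            raise RuntimeError(f"Unsupported NumPy dtype kind {dt.kind!r}")
        return kind_to_dl[dt.kind], dt.itemsize * 8  # (DLPack type code, bits)

    assert resolve_dtype(np.float32) == ("kDLFloat", 32)
    assert resolve_dtype("uint8") == ("kDLUInt", 8)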
9 changes: 8 additions & 1 deletion python/src/py_stream.cpp
@@ -38,8 +38,15 @@ PyStream::~PyStream() {

void PyStream::synchronize() { HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(m_stream)); }

uintptr_t PyStream::getHandle() { return reinterpret_cast<uintptr_t>(m_stream); }

void PyStream::Export(py::module& m) {
py::class_<PyStream>(m, "Stream", "Python wrapper for HIP streams.")
.def(py::init<>(), "Creates a HIP stream.")
.def("synchronize", &PyStream::synchronize, "Blocks until all worked queued on this stream is finished.");
.def("synchronize", &PyStream::synchronize, "Blocks until all worked queued on this stream is finished.")
.def("handle", &PyStream::getHandle,
"Returns the underlying HIP stream handle (hipStream_t) as an integer. "
"Intended for zero-copy interop with frameworks that accept a raw stream handle, "
"e.g. migraphx.run_async(..., stream_handle, \"ihipStream_t\"). "
"The handle is non-owning -- keep the Stream alive while the handle is in use.");
}