From fc788bc4cb1f81b473c5f0a5311a0d1a37138703 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 28 Apr 2026 18:36:00 -0400 Subject: [PATCH 01/20] Expose tensor pointer in rocpycv --- python/include/py_tensor.hpp | 16 ++++++++++++++++ python/src/py_tensor.cpp | 10 ++++++++++ 2 files changed, 26 insertions(+) diff --git a/python/include/py_tensor.hpp b/python/include/py_tensor.hpp index 523a5f27..f0c3e1d2 100644 --- a/python/include/py_tensor.hpp +++ b/python/include/py_tensor.hpp @@ -138,6 +138,22 @@ class PyTensor : public std::enable_shared_from_this { */ eDeviceType getDevice(); + /** + * @brief Returns the address of the tensor's underlying data buffer as an + * unsigned integer. For GPU tensors this is a HIP device address; for CPU + * tensors it is a host address. Use ``device()`` to disambiguate. + * + * The pointer is non-owning. The caller is responsible for ensuring this + * PyTensor remains alive for as long as the pointer is used; otherwise the + * underlying buffer may be freed and the pointer left dangling. + * + * Intended for zero-copy interop with frameworks that accept a raw + * pointer + shape + dtype (e.g. ``migraphx.argument_from_pointer``). + * + * @return uintptr_t + */ + uintptr_t getDataPtr(); + /** * @brief Gets the underlying roccv::Tensor that this tensor container wraps. 
* diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 913a25a4..8c864b46 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -178,6 +178,11 @@ eTensorLayout PyTensor::getLayout() { return m_tensor->layout().elayout(); } eDeviceType PyTensor::getDevice() { return m_tensor->device(); } +uintptr_t PyTensor::getDataPtr() { + auto tensorData = m_tensor->exportData(); + return reinterpret_cast(tensorData.basePtr()); +} + std::shared_ptr PyTensor::getTensor() { return m_tensor; } py::tuple PyTensor::getDLDevice() { @@ -207,6 +212,11 @@ void PyTensor::Export(pybind11::module& m) { .def("shape", &PyTensor::getShape, "Returns a list representing the tensor shape.") .def("layout", &PyTensor::getLayout, "Returns the layout for this tensor.") .def("device", &PyTensor::getDevice, "Returns the device this tensor is on.") + .def("data_ptr", &PyTensor::getDataPtr, + "Returns the address of the tensor's underlying buffer as an integer. " + "For GPU tensors this is a HIP device address; for CPU tensors a host address. " + "The pointer is non-owning -- keep the tensor alive for as long as the pointer is used. 
" + "Intended for zero-copy interop with frameworks like MIGraphX.") .def("ndim", &PyTensor::getRank, "Returns the number of dimensions of the tensor.") .def("dtype", &PyTensor::getDataType, "Returns the data type of the tensor.") .def("__dlpack_device__", &PyTensor::getDLDevice, From 134eb7ceed2c7240d633c02f47ed4c01621a546e Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Tue, 28 Apr 2026 18:40:44 -0400 Subject: [PATCH 02/20] Expose stream pointer in rocpycv --- python/include/py_stream.hpp | 12 ++++++++++++ python/src/py_stream.cpp | 9 ++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/python/include/py_stream.hpp b/python/include/py_stream.hpp index 202420c9..a5923359 100644 --- a/python/include/py_stream.hpp +++ b/python/include/py_stream.hpp @@ -55,6 +55,18 @@ class PyStream { */ void synchronize(); + /** + * @brief Returns the wrapped HIP stream handle as an unsigned integer. + * + * Intended for zero-copy interop with frameworks that accept a raw HIP + * stream handle (e.g. ``migraphx.run_async`` with stream type + * ``"ihipStream_t"``). The handle is non-owning -- keep this PyStream alive + * for as long as the handle is in use. + * + * @return uintptr_t + */ + uintptr_t getHandle(); + /** * @brief Exports the PyStream object to the specified python module. 
* diff --git a/python/src/py_stream.cpp b/python/src/py_stream.cpp index ac6ae73f..c830bc8e 100644 --- a/python/src/py_stream.cpp +++ b/python/src/py_stream.cpp @@ -38,8 +38,15 @@ PyStream::~PyStream() { void PyStream::synchronize() { HIP_VALIDATE_NO_ERRORS(hipStreamSynchronize(m_stream)); } +uintptr_t PyStream::getHandle() { return reinterpret_cast(m_stream); } + void PyStream::Export(py::module& m) { py::class_(m, "Stream", "Python wrapper for HIP streams.") .def(py::init<>(), "Creates a HIP stream.") - .def("synchronize", &PyStream::synchronize, "Blocks until all worked queued on this stream is finished."); + .def("synchronize", &PyStream::synchronize, "Blocks until all worked queued on this stream is finished.") + .def("handle", &PyStream::getHandle, + "Returns the underlying HIP stream handle (hipStream_t) as an integer. " + "Intended for zero-copy interop with frameworks that accept a raw stream handle, " + "e.g. migraphx.run_async(..., stream_handle, \"ihipStream_t\"). " + "The handle is non-owning -- keep the Stream alive while the handle is in use."); } \ No newline at end of file From 228f422133b1fa5833cc66a4ace8cef7ff977cb2 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 10:31:57 -0400 Subject: [PATCH 03/20] Update stubs --- python/src/rocpycv.pyi | 98 ++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 46 deletions(-) diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index 855a8d5a..c6bc639c 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -153,6 +153,10 @@ class Stream: """ Creates a HIP stream. """ + def handle(self) -> int: + """ + Returns the underlying HIP stream handle (hipStream_t) as an integer. Intended for zero-copy interop with frameworks that accept a raw stream handle, e.g. migraphx.run_async(..., stream_handle, "ihipStream_t"). The handle is non-owning -- keep the Stream alive while the handle is in use. 
+ """ def synchronize(self) -> None: """ Blocks until all worked queued on this stream is finished. @@ -166,7 +170,7 @@ class Tensor: """ Returns a tuple containing the DLPack device and device id for the tensor. """ - def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: eTensorLayout, dtype: eDataType, device: eDeviceType = ...) -> None: + def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: eTensorLayout, dtype: eDataType, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Constructs a tensor object. """ @@ -174,6 +178,10 @@ class Tensor: """ Returns a deep copy of the tensor with data copied to a specified device type. """ + def data_ptr(self) -> int: + """ + Returns the address of the tensor's underlying buffer as an integer. For GPU tensors this is a HIP device address; for CPU tensors a host address. The pointer is non-owning -- keep the tensor alive for as long as the pointer is used. Intended for zero-copy interop with frameworks like MIGraphX. + """ def device(self) -> eDeviceType: """ Returns the device this tensor is on. @@ -472,8 +480,6 @@ class eDataType: F32 F64 - - 4S16 """ F32: typing.ClassVar[eDataType] # value = F64: typing.ClassVar[eDataType] # value = @@ -483,7 +489,7 @@ class eDataType: U16: typing.ClassVar[eDataType] # value = U32: typing.ClassVar[eDataType] # value = U8: typing.ClassVar[eDataType] # value = - __members__: typing.ClassVar[dict[str, eDataType]] # value = {'U8': , 'S8': , 'U16': , 'S16': , 'U32': , 'S32': , 'F32': , 'F64': , '4S16': } + __members__: typing.ClassVar[dict[str, eDataType]] # value = {'U8': , 'S8': , 'U16': , 'S16': , 'U32': , 'S32': , 'F32': , 'F64': } def __eq__(self, other: typing.Any) -> bool: ... def __getstate__(self) -> int: @@ -728,7 +734,7 @@ class eThresholdType: @property def value(self) -> int: ... 
-def advcvtcolor(src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def advcvtcolor(src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Advanced Color Convert operation on the given HIP stream. @@ -745,7 +751,7 @@ def advcvtcolor(src: Tensor, conversion_code: eColorConversionCode, color_spec: Returns: rocpycv.Tensor: The output tensor. """ -def advcvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def advcvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, color_spec: eColorSpec, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Advanced Color Convert operation on the given HIP stream. @@ -763,7 +769,7 @@ def advcvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversion Returns: None """ -def bilateral_filter(src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def bilateral_filter(src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Bilateral Filter operation on the given HIP stream. 
@@ -783,7 +789,7 @@ def bilateral_filter(src: Tensor, diameter: typing.SupportsInt | typing.Supports Returns: rocpycv.Tensor: The output tensor. """ -def bilateral_filter_into(dst: Tensor, src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def bilateral_filter_into(dst: Tensor, src: Tensor, diameter: typing.SupportsInt | typing.SupportsIndex, sigmaColor: typing.SupportsFloat | typing.SupportsIndex, sigmaSpace: typing.SupportsFloat | typing.SupportsIndex, borderMode: eBorderType, borderValue: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Bilateral Filter operation on the given HIP stream. @@ -804,7 +810,7 @@ def bilateral_filter_into(dst: Tensor, src: Tensor, diameter: typing.SupportsInt Returns: None """ -def bndbox(src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def bndbox(src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the BndBox operation on the given HIP stream. @@ -820,7 +826,7 @@ def bndbox(src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = Non Returns: rocpycv.Tensor: The output tensor. """ -def bndbox_into(dst: Tensor, src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def bndbox_into(dst: Tensor, src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the BndBox operation on the given HIP stream. 
@@ -837,7 +843,7 @@ def bndbox_into(dst: Tensor, src: Tensor, bnd_boxes: BndBoxes, stream: rocpycv.S Returns: None """ -def center_crop(src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def center_crop(src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Center Crop operation on the given HIP stream. @@ -854,7 +860,7 @@ def center_crop(src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = N Returns: rocpycv.Tensor: The output tensor. """ -def center_crop_into(dst: Tensor, src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def center_crop_into(dst: Tensor, src: Tensor, crop_size: tuple, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Center Crop operation on the given HIP stream. @@ -871,7 +877,7 @@ def center_crop_into(dst: Tensor, src: Tensor, crop_size: tuple, stream: rocpycv Returns: None """ -def composite(foreground: Tensor, background: Tensor, fgmask: Tensor, outchannels: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def composite(foreground: Tensor, background: Tensor, fgmask: Tensor, outchannels: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Composite operation on the given HIP stream. @@ -889,7 +895,7 @@ def composite(foreground: Tensor, background: Tensor, fgmask: Tensor, outchannel Returns: rocpycv.Tensor: The output tensor with number of channels. """ -def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> None: +def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Composite operation on the given HIP stream. @@ -907,7 +913,7 @@ def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Returns: None """ -def convert_to(src: Tensor, dtype: eDataType, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def convert_to(src: Tensor, dtype: eDataType, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Convert To operation on the given HIP stream. @@ -925,7 +931,7 @@ def convert_to(src: Tensor, dtype: eDataType, alpha: typing.SupportsFloat | typi Returns: rocpycv.Tensor: The output tensor. """ -def convert_to_into(dst: Tensor, src: Tensor, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def convert_to_into(dst: Tensor, src: Tensor, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Convert To operation on the given HIP stream. 
@@ -943,7 +949,7 @@ def convert_to_into(dst: Tensor, src: Tensor, alpha: typing.SupportsFloat | typi Returns: None """ -def copymakeborder(src: Tensor, border_mode: eBorderType = ..., border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, bottom: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, right: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def copymakeborder(src: Tensor, border_mode: eBorderType = eBorderType.eBorderType.CONSTANT, border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, bottom: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, right: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the CopyMakeBorder operation on the given HIP stream. @@ -964,7 +970,7 @@ def copymakeborder(src: Tensor, border_mode: eBorderType = ..., border_value: li Returns: rocpycv.Tensor: The output tensor. """ -def copymakeborder_into(dst: Tensor, src: Tensor, border_mode: eBorderType = ..., border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def copymakeborder_into(dst: Tensor, src: Tensor, border_mode: eBorderType = eBorderType.eBorderType.CONSTANT, border_value: list = [0.0, 0.0, 0.0, 0.0], top: typing.SupportsInt | typing.SupportsIndex, left: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the CopyMakeBorder operation on the given HIP stream. @@ -984,7 +990,7 @@ def copymakeborder_into(dst: Tensor, src: Tensor, border_mode: eBorderType = ... 
Returns: None """ -def custom_crop(src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def custom_crop(src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Custom Crop operation on the given HIP stream. @@ -1001,7 +1007,7 @@ def custom_crop(src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = Non Returns: None """ -def custom_crop_into(dst: Tensor, src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def custom_crop_into(dst: Tensor, src: Tensor, crop_rect: Box, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Custom Crop operation on the given HIP stream. @@ -1017,7 +1023,7 @@ def custom_crop_into(dst: Tensor, src: Tensor, crop_rect: Box, stream: rocpycv.S Returns: rocpycv.Tensor: The output tensor. """ -def cvtcolor(src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def cvtcolor(src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Color Convert operation on the given HIP stream. @@ -1033,7 +1039,7 @@ def cvtcolor(src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv Returns: rocpycv.Tensor: The output tensor. """ -def cvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def cvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCode, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Color Convert operation on the given HIP stream. 
@@ -1050,7 +1056,7 @@ def cvtcolor_into(dst: Tensor, src: Tensor, conversion_code: eColorConversionCod Returns: None """ -def flip(src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def flip(src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Flip operation on the given HIP stream. @@ -1066,7 +1072,7 @@ def flip(src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stre Returns: rocpycv.Tensor: The output tensor. """ -def flip_into(dst: Tensor, src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def flip_into(dst: Tensor, src: Tensor, flip_code: typing.SupportsInt | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Flip operation on the given HIP stream. @@ -1087,7 +1093,7 @@ def from_dlpack(buffer: typing.Any, layout: eTensorLayout) -> Tensor: """ Wraps a DLPack supported tensor in a rocpycv tensor. """ -def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Gamma Contrast operation on the given HIP stream. @@ -1103,7 +1109,7 @@ def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsInd Returns: rocpycv.Tensor: The output tensor. """ -def gamma_contrast_into(dst: Tensor, src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> None: +def gamma_contrast_into(dst: Tensor, src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Gamma Contrast operation on the given HIP stream. @@ -1120,7 +1126,7 @@ def gamma_contrast_into(dst: Tensor, src: Tensor, gamma: typing.SupportsFloat | Returns: None """ -def histogram(src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def histogram(src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Histogram operation on the given HIP stream. @@ -1136,7 +1142,7 @@ def histogram(src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | Returns: rocpycv.Tensor: Output tensor with width of 256 and a height equal to the batch size of input (1 if HWC input). """ -def histogram_into(dst: Tensor, src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def histogram_into(dst: Tensor, src: Tensor, mask: rocpycv.Tensor | None, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Histogram operation on the given HIP stream. @@ -1153,7 +1159,7 @@ def histogram_into(dst: Tensor, src: Tensor, mask: rocpycv.Tensor | None, stream Returns: None """ -def nms(src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> Tensor: +def nms(src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Non-maximum Suppression operation on the given HIP stream. @@ -1171,7 +1177,7 @@ def nms(src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typ Returns: rocpycv.Tensor: The output tensor of shape [i, j], containing 1 (kept) or 0 (suppressed) for each bounding box (j) per batch (i). Results will be written to this tensor. """ -def nms_into(dst: Tensor, src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def nms_into(dst: Tensor, src: Tensor, scores: Tensor, score_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.1920928955078125e-07, iou_threshold: typing.SupportsFloat | typing.SupportsIndex = 1.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Non-maximum Suppression operation on the given HIP stream. @@ -1190,7 +1196,7 @@ def nms_into(dst: Tensor, src: Tensor, scores: Tensor, score_threshold: typing.S Returns: None """ -def normalize(src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> Tensor: +def normalize(src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Normalize operation on the given HIP stream. @@ -1211,7 +1217,7 @@ def normalize(src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsIn Returns: rocpycv.Tensor: The output tensor. """ -def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: typing.SupportsInt | typing.SupportsIndex | None = None, globalscale: typing.SupportsFloat | typing.SupportsIndex = 1.0, globalshift: typing.SupportsFloat | typing.SupportsIndex = 0.0, epsilon: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Normalize operation on the given HIP stream. @@ -1233,7 +1239,7 @@ def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: Returns: None """ -def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> Tensor: +def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Reformat operation and returns the result as a new tensor. @@ -1249,7 +1255,7 @@ def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | Returns: rocpycv.Tensor: The reformatted tensor. """ -def reformat_into(output: Tensor, input: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def reformat_into(output: Tensor, input: Tensor, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Reformat operation on the given HIP stream. @@ -1265,7 +1271,7 @@ def reformat_into(output: Tensor, input: Tensor, stream: rocpycv.Stream | None = Returns: None """ -def remap(src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def remap(src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Remap operation on the given HIP stream. @@ -1287,7 +1293,7 @@ def remap(src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_in Returns: rocpycv.Tensor: The output tensor. """ -def remap_into(dst: Tensor, src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> None: +def remap_into(dst: Tensor, src: Tensor, map: Tensor, in_interpolation: eInterpolationType, map_interpolation: eInterpolationType, map_value_type: eRemapType, align_corners: bool, border_type: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Remap operation on the given HIP stream. @@ -1310,7 +1316,7 @@ def remap_into(dst: Tensor, src: Tensor, map: Tensor, in_interpolation: eInterpo Returns: None """ -def resize(src: Tensor, shape: tuple, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def resize(src: Tensor, shape: tuple, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Resize operation on the given HIP stream. @@ -1327,7 +1333,7 @@ def resize(src: Tensor, shape: tuple, interp: eInterpolationType, stream: rocpyc Returns: rocpycv.Tensor: The output tensor. """ -def resize_into(dst: Tensor, src: Tensor, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def resize_into(dst: Tensor, src: Tensor, interp: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Resize operation on the given HIP stream. @@ -1344,7 +1350,7 @@ def resize_into(dst: Tensor, src: Tensor, interp: eInterpolationType, stream: ro Returns: None """ -def rotate(src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> Tensor: +def rotate(src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Rotate operation on the given HIP stream. @@ -1362,7 +1368,7 @@ def rotate(src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, Returns: rocpycv.Tensor: The output tensor. """ -def rotate_into(dst: Tensor, src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def rotate_into(dst: Tensor, src: Tensor, angle_deg: typing.SupportsFloat | typing.SupportsIndex, shift: tuple, interpolation: eInterpolationType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Rotate operation on the given HIP stream. @@ -1381,7 +1387,7 @@ def rotate_into(dst: Tensor, src: Tensor, angle_deg: typing.SupportsFloat | typi Returns: None """ -def threshold(src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def threshold(src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Thresholding operation on the given HIP stream. @@ -1397,7 +1403,7 @@ def threshold(src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing. stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. 
""" -def threshold_into(dst: Tensor, src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def threshold_into(dst: Tensor, src: Tensor, thresh: Tensor, maxVal: Tensor, maxBatchSize: typing.SupportsInt | typing.SupportsIndex, threshType: eThresholdType, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Thresholding operation on the given HIP stream. @@ -1414,7 +1420,7 @@ def threshold_into(dst: Tensor, src: Tensor, thresh: Tensor, maxVal: Tensor, max stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. """ -def warp_affine(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def warp_affine(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Warp Affine operation on the given HIP stream. @@ -1434,7 +1440,7 @@ def warp_affine(src: Tensor, xform: list, inverted: bool, interp: eInterpolation Returns: rocpycv.Tensor: The output tensor. """ -def warp_affine_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) 
-> None: +def warp_affine_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Warp Affine operation on the given HIP stream. @@ -1455,7 +1461,7 @@ def warp_affine_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, inte Returns: None """ -def warp_perspective(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> Tensor: +def warp_perspective(src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Warp Perspective operation on the given HIP stream. @@ -1475,7 +1481,7 @@ def warp_perspective(src: Tensor, xform: list, inverted: bool, interp: eInterpol Returns: rocpycv.Tensor: The output tensor. """ -def warp_perspective_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = ...) -> None: +def warp_perspective_into(dst: Tensor, src: Tensor, xform: list, inverted: bool, interp: eInterpolationType, border_mode: eBorderType, border_value: list, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ Executes the Warp Perspective operation on the given HIP stream. 
From 12c85cf5f46e5ef067d2601c5e345cded3d20819 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 11:14:08 -0400 Subject: [PATCH 04/20] Add migraphx classification example --- .../python/migraphx-classification/README.md | 52 ++++++ .../migraphx_classification.py | 173 ++++++++++++++++++ 2 files changed, 225 insertions(+) create mode 100644 samples/python/migraphx-classification/README.md create mode 100644 samples/python/migraphx-classification/migraphx_classification.py diff --git a/samples/python/migraphx-classification/README.md b/samples/python/migraphx-classification/README.md new file mode 100644 index 00000000..0de2d90d --- /dev/null +++ b/samples/python/migraphx-classification/README.md @@ -0,0 +1,52 @@ +# rocCV MIGraphX Classification Sample + +This sample demonstrates how to use rocCV to preprocess an image on the GPU and run inference with a ResNet50 ONNX model through MIGraphX. The preprocessed tensor is handed off to MIGraphX via a raw GPU pointer for zero-copy interop, so no host round-trip is needed between preprocessing and inference. + +## Dependencies + +- A rocCV build with the Python bindings (`rocpycv`) on `PYTHONPATH`. Build rocCV with Python 3.11 by passing the following to cmake: + ```shell + -DPYTHON_VERSION_SUGGESTED=3.11 + ``` +- [MIGraphX](https://github.com/ROCm/AMDMIGraphX) with its Python bindings. +- `opencv-python` and `numpy`. +- A ResNet50 ONNX model with input name `data` and shape `[N, 3, 224, 224]` (e.g. the ONNX Model Zoo `resnet50-v1-7.onnx`). +- Optional: a newline-separated ImageNet class labels file for human-readable output. + +## Command line + +```shell +python3.11 migraphx_classification.py \ + --input path/to/image.jpg \ + --model path/to/resnet50.onnx \ + --labels path/to/imagenet_classes.txt \ + --top-k 5 +``` + +Arguments: +- `--input` (required): path to the input image. +- `--model` (required): path to the ResNet50 ONNX file. +- `--labels` (optional): path to an ImageNet class label file. 
If omitted, classes are reported by index. +- `--top-k` (optional, default 5): number of top predictions to print. + +On the first run, the script compiles the ONNX model for the GPU and caches the result alongside the ONNX file as `_b1.mxr`. Subsequent runs load the cached `.mxr` directly and skip compilation. + +## Preprocessing Operators + +The preprocessing pipeline runs entirely on the GPU through `rocpycv`: + +1. **CvtColor**: Converts the OpenCV BGR image to RGB. +2. **Resize**: Resizes to 224x224 using cubic interpolation. +3. **Convert To**: Casts U8 pixels to float32 (no scaling — the `/255` step is folded into the normalize parameters). +4. **Normalize**: Applies ImageNet mean/std normalization. The mean and std constants are pre-multiplied by 255 so the operator can normalize directly from the [0, 255] float pixel range in a single pass. +5. **Reformat**: Converts the tensor from NHWC to NCHW, the layout MIGraphX/ONNX expects. + +## MIGraphX Interop + +The compiled MIGraphX program is built with `offload_copy=False`, so input and output buffers must already live on the GPU. The sample binds: +- The rocCV preprocessed tensor's GPU pointer (`tensor.data_ptr()`) as the `data` input via `migraphx.argument_from_pointer`. +- A `migraphx.allocate_gpu` buffer as the output. + +Inference is launched with `model.run_async` using the same HIP stream as the preprocessing pipeline (`stream.handle()`), so preprocessing and inference are serialized on a single stream with no extra synchronization until the final `stream.synchronize()`. + +The output logits are copied back to the host with `migraphx.from_gpu`, passed through softmax, and the top-K classes are printed. 
diff --git a/samples/python/migraphx-classification/migraphx_classification.py b/samples/python/migraphx-classification/migraphx_classification.py new file mode 100644 index 00000000..451df9e3 --- /dev/null +++ b/samples/python/migraphx-classification/migraphx_classification.py @@ -0,0 +1,173 @@ +# ############################################################################## +# Copyright (c) - 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +# ############################################################################## + +"""Classification with rocCV preprocessing and MIGraphX inference.""" + +from __future__ import annotations + +import argparse +import os + +import cv2 +import migraphx +import numpy as np +import rocpycv + + +# ImageNet normalization, scaled to the [0, 255] pixel range so we can apply +# them directly to U8-derived float pixels without a separate /255 step: +# (pixel/255 - mean) / std == (pixel - mean*255) / (std*255) +IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32) * 255.0 +IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32) * 255.0 + +INPUT_H, INPUT_W = 224, 224 + + +def read_image(image_path: str) -> np.ndarray: + """Read an image from disk as an NHWC uint8 BGR numpy array.""" + bgr = cv2.imread(image_path) + if bgr is None: + raise FileNotFoundError(f"Unable to load image: {image_path}") + return np.stack([bgr]) + + +def load_or_compile_model(onnx_path: str) -> migraphx.program: + """Load a cached compiled model, or parse + compile + cache the ONNX file.""" + # TODO: Support other batch sizes later + batch_size = 1 + cache_path = f"{os.path.splitext(onnx_path)[0]}_b{batch_size}.mxr" + + if os.path.exists(cache_path): + print(f"Loading cached compiled model: {cache_path}") + return migraphx.load(cache_path, format="msgpack") + + print(f"Parsing ONNX: {onnx_path}") + model = migraphx.parse_onnx( + onnx_path, + map_input_dims={"data": [batch_size, 3, INPUT_H, INPUT_W]}, + ) + + print("Compiling for GPU...") + # offload_copy=False allows us to bind GPU buffers directly to allow for + # zero-copy interop. 
+ model.compile(migraphx.get_target("gpu"), offload_copy=False) + + print(f"Caching compiled model to: {cache_path}") + migraphx.save(model, cache_path, format="msgpack") + return model + + +def load_labels(labels_path: str | None) -> list[str] | None: + if labels_path is None: + return None + with open(labels_path) as f: + return [line.strip() for line in f if line.strip()] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Classification with rocCV preprocessing and MIGraphX inference" + ) + parser.add_argument("--input", required=True, help="Path to input image") + parser.add_argument( + "--model", required=True, help="Path to a ResNet50 ONNX model" + ) + parser.add_argument( + "--labels", + default=None, + help="Optional path to a newline-separated ImageNet class labels file", + ) + parser.add_argument("--top-k", type=int, default=5) + return parser.parse_args() + + +def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: + x = x - np.max(x, axis=axis, keepdims=True) + e = np.exp(x) + return e / np.sum(e, axis=axis, keepdims=True) + + +def main() -> None: + args = parse_args() + + # 1. Load the model + model = load_or_compile_model(args.model) + + print(f"Reading image: {args.input}") + np_image = read_image(args.input) + print(f"Input image shape: {np_image.shape}") + + print("Preprocessing with rocCV...") + stream = rocpycv.Stream() + + # 2. Convert the image to a rocCV tensor in NHWC layout. + tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) + + # 3. Convert from BGR to RGB for MIGraphX. + tensor = rocpycv.cvtcolor(tensor, rocpycv.COLOR_BGR2RGB, stream, rocpycv.GPU) + + # 4. Resize to 224x224. + tensor = rocpycv.resize(tensor, (1, INPUT_H, INPUT_W, 3), rocpycv.CUBIC, stream, rocpycv.GPU) + + # 5. Cast U8 -> F32 (no scaling; normalize step folds in /255). + tensor = rocpycv.convert_to(tensor, rocpycv.eDataType.F32, 1.0, 0.0, stream, rocpycv.GPU) + + # 6. 
ImageNet normalize: (pixel - mean) / std. + mean_t = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) + std_t = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) + tensor = rocpycv.normalize(tensor, mean_t, std_t, rocpycv.NormalizeFlags.SCALE_IS_STDDEV, 1.0, 0.0, 0.0, stream, rocpycv.GPU) + + # 7. NHWC -> NCHW (MIGraphX / ONNX expects NCHW). + tensor = rocpycv.reformat(tensor, rocpycv.eTensorLayout.NCHW, stream, rocpycv.GPU) + print(f"Preprocessed tensor shape (NCHW): {tensor.shape()}") + + print("Running MIGraphX inference...") + + # Setup MIGraphX arguments/shapes + in_shape = migraphx.shape(type="float_type", lens=tensor.shape()) + out_shape = migraphx.shape(type="float_type", lens=[1, 1000]) + in_arg = migraphx.argument_from_pointer(in_shape, tensor.data_ptr()) + out_buf = migraphx.allocate_gpu(out_shape) + + outputs = model.run_async( + {"data": in_arg, "main:#output_0": out_buf}, + stream.handle(), + "ihipStream_t", + ) + stream.synchronize() + + logits = np.array(migraphx.from_gpu(outputs[0])) + probs = softmax(logits, axis=1) + + labels = load_labels(args.labels) + + # Report top-K for the first image in the batch. + print(f"\nTop {args.top_k} predictions:") + top = np.argsort(probs[0])[::-1][: args.top_k] + for rank, idx in enumerate(top, start=1): + name = labels[idx] if labels is not None and idx < len(labels) else f"class {idx}" + print(f" {rank}. 
{name}: {probs[0][idx]:.6f}") + + +if __name__ == "__main__": + main() From 8fb6c4f6ce575d6494f00792bd37c797fb29d36c Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 11:14:23 -0400 Subject: [PATCH 05/20] Remove 4S16 from pyenums --- python/src/py_enums.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/python/src/py_enums.cpp b/python/src/py_enums.cpp index 0ce39d08..6059c280 100644 --- a/python/src/py_enums.cpp +++ b/python/src/py_enums.cpp @@ -48,7 +48,6 @@ void PyEnums::Export(py::module& m) { .value("S32", DATA_TYPE_S32) .value("F32", DATA_TYPE_F32) .value("F64", DATA_TYPE_F64) - .value("4S16", DATA_TYPE_4S16) .export_values(); py::enum_(m, "eDeviceType") From e024eefcf86c1b01fb10d8bb03e49f7cc3bece5c Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 12:10:06 -0400 Subject: [PATCH 06/20] Move classification sample location --- samples/{python => }/migraphx-classification/README.md | 0 .../migraphx-classification/migraphx_classification.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename samples/{python => }/migraphx-classification/README.md (100%) rename samples/{python => }/migraphx-classification/migraphx_classification.py (100%) diff --git a/samples/python/migraphx-classification/README.md b/samples/migraphx-classification/README.md similarity index 100% rename from samples/python/migraphx-classification/README.md rename to samples/migraphx-classification/README.md diff --git a/samples/python/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py similarity index 100% rename from samples/python/migraphx-classification/migraphx_classification.py rename to samples/migraphx-classification/migraphx_classification.py From 25e2b9dcb16be5e6805764a6dc68373f7b9aacd9 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 12:11:06 -0400 Subject: [PATCH 07/20] Move pytorch classification sample location --- samples/{classification => 
pytorch-classification}/README.md | 0 .../pytorch_classification.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename samples/{classification => pytorch-classification}/README.md (100%) rename samples/{classification => pytorch-classification}/pytorch_classification.py (100%) diff --git a/samples/classification/README.md b/samples/pytorch-classification/README.md similarity index 100% rename from samples/classification/README.md rename to samples/pytorch-classification/README.md diff --git a/samples/classification/pytorch_classification.py b/samples/pytorch-classification/pytorch_classification.py similarity index 100% rename from samples/classification/pytorch_classification.py rename to samples/pytorch-classification/pytorch_classification.py From c8da0a2e1e0bf2a00ad32c9914f500379904c952 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 17:31:30 -0400 Subject: [PATCH 08/20] Cleanup migraphx classification sample --- samples/migraphx-classification/README.md | 4 +- .../migraphx_classification.py | 62 +++++++++++-------- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/samples/migraphx-classification/README.md b/samples/migraphx-classification/README.md index 0de2d90d..ca1dfdc1 100644 --- a/samples/migraphx-classification/README.md +++ b/samples/migraphx-classification/README.md @@ -35,8 +35,8 @@ On the first run, the script compiles the ONNX model for the GPU and caches the The preprocessing pipeline runs entirely on the GPU through `rocpycv`: -1. **CvtColor**: Converts the OpenCV BGR image to RGB. -2. **Resize**: Resizes to 224x224 using cubic interpolation. +1. **Resize**: Resizes to 224x224 using cubic interpolation. +2. **CvtColor**: Converts the OpenCV BGR image to RGB. 3. **Convert To**: Casts U8 pixels to float32 (no scaling — the `/255` step is folded into the normalize parameters). 4. **Normalize**: Applies ImageNet mean/std normalization. 
The mean and std constants are pre-multiplied by 255 so the operator can normalize directly from the [0, 255] float pixel range in a single pass. 5. **Reformat**: Converts the tensor from NHWC to NCHW, the layout MIGraphX/ONNX expects. diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py index 451df9e3..17d7a81e 100644 --- a/samples/migraphx-classification/migraphx_classification.py +++ b/samples/migraphx-classification/migraphx_classification.py @@ -41,6 +41,7 @@ IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32) * 255.0 INPUT_H, INPUT_W = 224, 224 +BATCH_SIZE = 1 def read_image(image_path: str) -> np.ndarray: @@ -51,11 +52,12 @@ def read_image(image_path: str) -> np.ndarray: return np.stack([bgr]) -def load_or_compile_model(onnx_path: str) -> migraphx.program: +def load_or_compile_model(onnx_path: str, use_fp16: bool = True) -> migraphx.program: """Load a cached compiled model, or parse + compile + cache the ONNX file.""" # TODO: Support other batch sizes later batch_size = 1 - cache_path = f"{os.path.splitext(onnx_path)[0]}_b{batch_size}.mxr" + precision_tag = "fp16" if use_fp16 else "fp32" + cache_path = f"{os.path.splitext(onnx_path)[0]}_b{batch_size}_{precision_tag}.mxr" if os.path.exists(cache_path): print(f"Loading cached compiled model: {cache_path}") @@ -67,6 +69,12 @@ def load_or_compile_model(onnx_path: str) -> migraphx.program: map_input_dims={"data": [batch_size, 3, INPUT_H, INPUT_W]}, ) + if use_fp16: + print("Quantizing to FP16...") + # Inserts internal float -> half conversions; model inputs/outputs stay + # float32, so the existing F32 buffer setup remains unchanged. + migraphx.quantize_fp16(model) + print("Compiling for GPU...") # offload_copy=False allows us to bind GPU buffers directly to allow for # zero-copy interop. @@ -110,45 +118,44 @@ def softmax(x: np.ndarray, axis: int = -1) -> np.ndarray: def main() -> None: args = parse_args() - # 1. 
Load the model model = load_or_compile_model(args.model) print(f"Reading image: {args.input}") np_image = read_image(args.input) print(f"Input image shape: {np_image.shape}") - print("Preprocessing with rocCV...") - stream = rocpycv.Stream() + # Load/allocate tensors on the GPU + input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) + resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) + rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) + f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) + normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) + nchw : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), rocpycv.NCHW, rocpycv.F32) - # 2. Convert the image to a rocCV tensor in NHWC layout. - tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) + mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) + std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) - # 3. Convert from BGR to RGB for MIGraphX. - tensor = rocpycv.cvtcolor(tensor, rocpycv.COLOR_BGR2RGB, stream, rocpycv.GPU) - - # 4. Resize to 224x224. - tensor = rocpycv.resize(tensor, (1, INPUT_H, INPUT_W, 3), rocpycv.CUBIC, stream, rocpycv.GPU) + # Setup MIGraphX arguments/shapes + in_shape : migraphx.shape = migraphx.shape(type="float_type", lens=nchw.shape()) + out_shape : migraphx.shape = migraphx.shape(type="float_type", lens=[BATCH_SIZE, 1000]) - # 5. Cast U8 -> F32 (no scaling; normalize step folds in /255). 
- tensor = rocpycv.convert_to(tensor, rocpycv.eDataType.F32, 1.0, 0.0, stream, rocpycv.GPU) + in_arg : migraphx.argument = migraphx.argument_from_pointer(in_shape, nchw.data_ptr()) + out_buf : migraphx.buffer = migraphx.allocate_gpu(out_shape) - # 6. ImageNet normalize: (pixel - mean) / std. - mean_t = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) - std_t = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) - tensor = rocpycv.normalize(tensor, mean_t, std_t, rocpycv.NormalizeFlags.SCALE_IS_STDDEV, 1.0, 0.0, 0.0, stream, rocpycv.GPU) + # Begin preprocessing + print("Preprocessing with rocCV...") + stream = rocpycv.Stream() - # 7. NHWC -> NCHW (MIGraphX / ONNX expects NCHW). - tensor = rocpycv.reformat(tensor, rocpycv.eTensorLayout.NCHW, stream, rocpycv.GPU) - print(f"Preprocessed tensor shape (NCHW): {tensor.shape()}") + rocpycv.resize_into(resized, input_tensor, rocpycv.CUBIC, stream) + rocpycv.cvtcolor_into(rgb, resized, rocpycv.COLOR_BGR2RGB, stream) + rocpycv.convert_to_into(f32, rgb, 1.0, 0.0, stream) + rocpycv.normalize_into(normalized, f32, mean_t, std_t, rocpycv.NormalizeFlags.SCALE_IS_STDDEV, 1.0, 0.0, 0.0, stream) + rocpycv.reformat_into(nchw, normalized, stream) + + print(f"Preprocessed tensor shape (NCHW): {nchw.shape()}") print("Running MIGraphX inference...") - # Setup MIGraphX arguments/shapes - in_shape = migraphx.shape(type="float_type", lens=tensor.shape()) - out_shape = migraphx.shape(type="float_type", lens=[1, 1000]) - in_arg = migraphx.argument_from_pointer(in_shape, tensor.data_ptr()) - out_buf = migraphx.allocate_gpu(out_shape) - outputs = model.run_async( {"data": in_arg, "main:#output_0": out_buf}, stream.handle(), @@ -156,6 +163,7 @@ def main() -> None: ) stream.synchronize() + # Postprocess the inference results logits = np.array(migraphx.from_gpu(outputs[0])) probs = softmax(logits, axis=1) From 666226fa62af41ea7983fb847d11e7f0529e5535 Mon Sep 17 00:00:00 
2001 From: Zach Vincze Date: Wed, 29 Apr 2026 17:35:24 -0400 Subject: [PATCH 09/20] Minor cleanup --- samples/migraphx-classification/README.md | 7 ++----- .../migraphx-classification/migraphx_classification.py | 8 +++----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/samples/migraphx-classification/README.md b/samples/migraphx-classification/README.md index ca1dfdc1..c2edcfdd 100644 --- a/samples/migraphx-classification/README.md +++ b/samples/migraphx-classification/README.md @@ -4,10 +4,7 @@ This sample demonstrates how to use rocCV to preprocess an image on the GPU and ## Dependencies -- A rocCV build with the Python bindings (`rocpycv`) on `PYTHONPATH`. Build rocCV with Python 3.11 by passing the following to cmake: - ```shell - -DPYTHON_VERSION_SUGGESTED=3.11 - ``` +- A rocCV build with the Python bindings (`rocpycv`) on `PYTHONPATH`. - [MIGraphX](https://github.com/ROCm/AMDMIGraphX) with its Python bindings. - `opencv-python` and `numpy`. - A ResNet50 ONNX model with input name `data` and shape `[N, 3, 224, 224]` (e.g. the ONNX Model Zoo `resnet50-v1-7.onnx`). 
@@ -16,7 +13,7 @@ This sample demonstrates how to use rocCV to preprocess an image on the GPU and ## Command line ```shell -python3.11 migraphx_classification.py \ +python3 migraphx_classification.py \ --input path/to/image.jpg \ --model path/to/resnet50.onnx \ --labels path/to/imagenet_classes.txt \ diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py index 17d7a81e..c144e534 100644 --- a/samples/migraphx-classification/migraphx_classification.py +++ b/samples/migraphx-classification/migraphx_classification.py @@ -54,10 +54,8 @@ def read_image(image_path: str) -> np.ndarray: def load_or_compile_model(onnx_path: str, use_fp16: bool = True) -> migraphx.program: """Load a cached compiled model, or parse + compile + cache the ONNX file.""" - # TODO: Support other batch sizes later - batch_size = 1 precision_tag = "fp16" if use_fp16 else "fp32" - cache_path = f"{os.path.splitext(onnx_path)[0]}_b{batch_size}_{precision_tag}.mxr" + cache_path = f"{os.path.splitext(onnx_path)[0]}_b{BATCH_SIZE}_{precision_tag}.mxr" if os.path.exists(cache_path): print(f"Loading cached compiled model: {cache_path}") @@ -66,7 +64,7 @@ def load_or_compile_model(onnx_path: str, use_fp16: bool = True) -> migraphx.pro print(f"Parsing ONNX: {onnx_path}") model = migraphx.parse_onnx( onnx_path, - map_input_dims={"data": [batch_size, 3, INPUT_H, INPUT_W]}, + map_input_dims={"data": [BATCH_SIZE, 3, INPUT_H, INPUT_W]}, ) if use_fp16: @@ -144,7 +142,7 @@ def main() -> None: # Begin preprocessing print("Preprocessing with rocCV...") - stream = rocpycv.Stream() + stream : rocpycv.Stream = rocpycv.Stream() rocpycv.resize_into(resized, input_tensor, rocpycv.CUBIC, stream) rocpycv.cvtcolor_into(rgb, resized, rocpycv.COLOR_BGR2RGB, stream) From d0ee8993170a3c7ab352bcafe99cc0e1fe6ee2af Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 17:39:44 -0400 Subject: [PATCH 10/20] Minor variable name changes --- 
samples/migraphx-classification/README.md | 2 +- .../migraphx_classification.py | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/samples/migraphx-classification/README.md b/samples/migraphx-classification/README.md index c2edcfdd..83c7f4e2 100644 --- a/samples/migraphx-classification/README.md +++ b/samples/migraphx-classification/README.md @@ -41,7 +41,7 @@ The preprocessing pipeline runs entirely on the GPU through `rocpycv`: ## MIGraphX Interop The compiled MIGraphX program is built with `offload_copy=False`, so input and output buffers must already live on the GPU. The sample binds: -- The rocCV preprocessed tensor's GPU pointer (`tensor.data_ptr()`) as the `data` input via `migraphx.argument_from_pointer`. +- The rocCV preprocessed tensor's GPU pointer (`preprocessed.data_ptr()`) as the `data` input via `migraphx.argument_from_pointer`. - A `migraphx.allocate_gpu` buffer as the output. Inference is launched with `model.run_async` using the same HIP stream as the preprocessing pipeline (`stream.handle()`), so preprocessing and inference are serialized on a single stream with no extra synchronization until the final `stream.synchronize()`. 
diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py index c144e534..e36b0c85 100644 --- a/samples/migraphx-classification/migraphx_classification.py +++ b/samples/migraphx-classification/migraphx_classification.py @@ -123,21 +123,21 @@ def main() -> None: print(f"Input image shape: {np_image.shape}") # Load/allocate tensors on the GPU - input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) - resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) - rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) - f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) - normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) - nchw : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), rocpycv.NCHW, rocpycv.F32) + input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) + resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) + rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) + f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) + normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) + preprocessed : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), rocpycv.NCHW, rocpycv.F32) - mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) - std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) + mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), 
rocpycv.NHWC).copy_to(rocpycv.GPU) + std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) # Setup MIGraphX arguments/shapes - in_shape : migraphx.shape = migraphx.shape(type="float_type", lens=nchw.shape()) + in_shape : migraphx.shape = migraphx.shape(type="float_type", lens=preprocessed.shape()) out_shape : migraphx.shape = migraphx.shape(type="float_type", lens=[BATCH_SIZE, 1000]) - in_arg : migraphx.argument = migraphx.argument_from_pointer(in_shape, nchw.data_ptr()) + in_arg : migraphx.argument = migraphx.argument_from_pointer(in_shape, preprocessed.data_ptr()) out_buf : migraphx.buffer = migraphx.allocate_gpu(out_shape) # Begin preprocessing @@ -148,9 +148,9 @@ def main() -> None: rocpycv.cvtcolor_into(rgb, resized, rocpycv.COLOR_BGR2RGB, stream) rocpycv.convert_to_into(f32, rgb, 1.0, 0.0, stream) rocpycv.normalize_into(normalized, f32, mean_t, std_t, rocpycv.NormalizeFlags.SCALE_IS_STDDEV, 1.0, 0.0, 0.0, stream) - rocpycv.reformat_into(nchw, normalized, stream) + rocpycv.reformat_into(preprocessed, normalized, stream) - print(f"Preprocessed tensor shape (NCHW): {nchw.shape()}") + print(f"Preprocessed tensor shape (NCHW): {preprocessed.shape()}") print("Running MIGraphX inference...") From 8ad82ba014b8a7f65410eb7190bde9e0854bba08 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 18:15:51 -0400 Subject: [PATCH 11/20] Allow use of numpy types to specify rocpycv.Tensor types --- python/include/py_helpers.hpp | 21 +++++++- python/src/py_helpers.cpp | 49 +++++++++++++++++++ python/src/py_tensor.cpp | 29 ++++++++--- python/src/rocpycv.pyi | 12 ++--- .../migraphx_classification.py | 18 +++---- 5 files changed, 107 insertions(+), 22 deletions(-) diff --git a/python/include/py_helpers.hpp b/python/include/py_helpers.hpp index 369cac10..2e3dc117 100644 --- a/python/include/py_helpers.hpp +++ b/python/include/py_helpers.hpp @@ -86,4 +86,23 @@ extern double2 GetDouble2FromTuple(py::tuple src); * 
@param src A python tuple of size 2. * @return int2 */ -extern int2 GetInt2FromTuple(py::tuple src); \ No newline at end of file +extern int2 GetInt2FromTuple(py::tuple src); + +/** + * @brief Resolves a Python object to an eTensorLayout. Accepts either an rocpycv.eTensorLayout enum + * value, or a layout string such as "NHWC". Throws std::runtime_error for unsupported inputs. + * + * @param obj A Python object describing the tensor layout. + * @return eTensorLayout + */ +extern eTensorLayout LayoutFromPyObject(py::object obj); + +/** + * @brief Resolves a Python object to an eDataType. Accepts either an rocpycv.eDataType enum value, + * or anything coercible to a NumPy dtype (e.g. ``np.float32``, ``np.dtype("uint8")``, + * ``"float32"``). Throws std::runtime_error for unsupported inputs. + * + * @param obj A Python object describing the tensor data type. + * @return eDataType + */ +extern eDataType DataTypeFromPyObject(py::object obj); \ No newline at end of file diff --git a/python/src/py_helpers.cpp b/python/src/py_helpers.cpp index 97902915..8d4dce55 100644 --- a/python/src/py_helpers.cpp +++ b/python/src/py_helpers.cpp @@ -22,7 +22,11 @@ THE SOFTWARE. #include "py_helpers.hpp" +#include +#include + #include +#include eDataType DLTypeToRoccvType(DLDataType dtype) { if (dtype.bits == 8) { @@ -147,4 +147,49 @@ int2 GetInt2FromTuple(py::tuple src) { std::runtime_error("Cannot convert py::tuple to int2. py::tuple.size() != 2."); } return make_int2(src[0].cast<int>(), src[1].cast<int>()); +} + +eTensorLayout LayoutFromPyObject(py::object obj) { + if (py::isinstance<eTensorLayout>(obj)) { + return obj.cast<eTensorLayout>(); + } + + if (py::isinstance<py::str>(obj)) { + std::string s = obj.cast<std::string>(); + for (const auto& [layout, name] : roccv::TensorLayout::layoutStringTable) { + if (name == s) return layout; + } + throw std::runtime_error("Unknown tensor layout string: '" + s + "'."); + } + + throw std::runtime_error("layout must be an rocpycv.eTensorLayout or a layout string (e.g. 
'NHWC')."); +} + +eDataType DataTypeFromPyObject(py::object obj) { + if (py::isinstance(obj)) { + return obj.cast(); + } + + // np.dtype() accepts numpy scalar types (np.float32), dtype instances, and dtype strings, + // so we delegate the parsing to NumPy itself rather than enumerating cases here. + py::dtype dt; + try { + static const py::object np_dtype = py::module_::import("numpy").attr("dtype"); + dt = np_dtype(obj).cast(); + } catch (const std::exception&) { + throw std::runtime_error( + "dtype must be an rocpycv.eDataType or a NumPy dtype/scalar type (e.g. np.float32)."); + } + + DLDataTypeCode code; + switch (dt.kind()) { + case 'u': code = kDLUInt; break; + case 'i': code = kDLInt; break; + case 'f': code = kDLFloat; break; + default: + throw std::runtime_error("Unsupported NumPy dtype for rocpycv.Tensor (kind '" + + std::string(1, dt.kind()) + "')."); + } + DLDataType dl{static_cast(code), static_cast(dt.itemsize() * 8), 1}; + return DLTypeToRoccvType(dl); } \ No newline at end of file diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 8c864b46..4fa4dd1b 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -202,8 +202,15 @@ void PyTensor::Export(pybind11::module& m) { pybind11::class_> tensor(m, "Tensor"); tensor - .def(pybind11::init, eTensorLayout, eDataType, eDeviceType>(), "shape"_a, "layout"_a, - "dtype"_a, "device"_a = eDeviceType::GPU, "Constructs a tensor object.") + .def(pybind11::init([](std::vector shape, py::object layout, py::object dtype, eDeviceType device) { + return std::make_shared(shape, LayoutFromPyObject(layout), DataTypeFromPyObject(dtype), + device); + }), + "shape"_a, "layout"_a, "dtype"_a, "device"_a = eDeviceType::GPU, + "Constructs a tensor object. ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. " + "``rocpycv.NHWC``) or a layout string (``\"NHWC\"``). ``dtype`` may be an " + "``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type " + "(e.g. 
``np.float32``).") .def("copy_to", &PyTensor::copyTo, "device"_a, "Returns a deep copy of the tensor with data copied to a specified device type.") .def("__dlpack__", &PyTensor::toDLPack, "stream"_a = py::none(), @@ -221,8 +228,18 @@ void PyTensor::Export(pybind11::module& m) { .def("dtype", &PyTensor::getDataType, "Returns the data type of the tensor.") .def("__dlpack_device__", &PyTensor::getDLDevice, "Returns a tuple containing the DLPack device and device id for the tensor.") - .def("reshape", &PyTensor::reshape, "new_shape"_a, "layout"_a, - "Creates a new tensor with the specified shape."); - m.def("from_dlpack", &PyTensor::fromDLPack, "buffer"_a, "layout"_a, - "Wraps a DLPack supported tensor in a rocpycv tensor."); + .def( + "reshape", + [](PyTensor& self, std::vector newShape, py::object layout) { + return self.reshape(newShape, LayoutFromPyObject(layout)); + }, + "new_shape"_a, "layout"_a, + "Creates a new tensor with the specified shape. ``layout`` may be an " + "``rocpycv.eTensorLayout`` or a layout string (e.g. ``\"NHWC\"``)."); + m.def( + "from_dlpack", + [](pybind11::object src, py::object layout) { return PyTensor::fromDLPack(src, LayoutFromPyObject(layout)); }, + "buffer"_a, "layout"_a, + "Wraps a DLPack supported tensor in a rocpycv tensor. ``layout`` may be an " + "``rocpycv.eTensorLayout`` or a layout string (e.g. ``\"NHWC\"``)."); } \ No newline at end of file diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index c6bc639c..ce2e7584 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -170,9 +170,9 @@ class Tensor: """ Returns a tuple containing the DLPack device and device id for the tensor. 
""" - def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: eTensorLayout, dtype: eDataType, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: + def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: typing.Any, dtype: typing.Any, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ - Constructs a tensor object. + Constructs a tensor object. ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NHWC``) or a layout string (``"NHWC"``). ``dtype`` may be an ``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). """ def copy_to(self, device: eDeviceType) -> Tensor: """ @@ -198,9 +198,9 @@ class Tensor: """ Returns the number of dimensions of the tensor. """ - def reshape(self, new_shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: eTensorLayout) -> Tensor: + def reshape(self, new_shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: typing.Any) -> Tensor: """ - Creates a new tensor with the specified shape. + Creates a new tensor with the specified shape. ``layout`` may be an ``rocpycv.eTensorLayout`` or a layout string (e.g. ``"NHWC"``). """ def shape(self) -> list[int]: """ @@ -1089,9 +1089,9 @@ def flip_into(dst: Tensor, src: Tensor, flip_code: typing.SupportsInt | typing.S Returns: None """ -def from_dlpack(buffer: typing.Any, layout: eTensorLayout) -> Tensor: +def from_dlpack(buffer: typing.Any, layout: typing.Any) -> Tensor: """ - Wraps a DLPack supported tensor in a rocpycv tensor. + Wraps a DLPack supported tensor in a rocpycv tensor. ``layout`` may be an ``rocpycv.eTensorLayout`` or a layout string (e.g. ``"NHWC"``). 
""" def gamma_contrast(src: Tensor, gamma: typing.SupportsFloat | typing.SupportsIndex, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py index e36b0c85..b064433d 100644 --- a/samples/migraphx-classification/migraphx_classification.py +++ b/samples/migraphx-classification/migraphx_classification.py @@ -123,15 +123,15 @@ def main() -> None: print(f"Input image shape: {np_image.shape}") # Load/allocate tensors on the GPU - input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, rocpycv.NHWC).copy_to(rocpycv.GPU) - resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) - rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.U8) - f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) - normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), rocpycv.NHWC, rocpycv.F32) - preprocessed : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), rocpycv.NCHW, rocpycv.F32) - - mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) - std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), rocpycv.NHWC).copy_to(rocpycv.GPU) + input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, "NHWC").copy_to(rocpycv.GPU) + resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.uint8) + rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.uint8) + f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.float32) + normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.float32) + preprocessed : rocpycv.Tensor = 
rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), "NCHW", np.float32) + + mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), "NHWC").copy_to(rocpycv.GPU) + std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), "NHWC").copy_to(rocpycv.GPU) # Setup MIGraphX arguments/shapes in_shape : migraphx.shape = migraphx.shape(type="float_type", lens=preprocessed.shape()) From 6a0e0faf4516e974985bafe4a2235efbfbb58db9 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 18:26:07 -0400 Subject: [PATCH 12/20] Swap dtype <-> layout position in tensor construction --- python/include/py_tensor.hpp | 4 ++-- python/src/py_tensor.cpp | 16 ++++++++-------- python/src/rocpycv.pyi | 4 ++-- .../migraphx_classification.py | 10 +++++----- tests/roccv/python/test_op_adv_cvt_color.py | 6 +++--- tests/roccv/python/test_op_bilateral_filter.py | 2 +- tests/roccv/python/test_op_bnd_box.py | 3 +-- tests/roccv/python/test_op_center_crop.py | 3 +-- tests/roccv/python/test_op_composite.py | 2 +- tests/roccv/python/test_op_convert_to.py | 2 +- tests/roccv/python/test_op_copy_make_border.py | 2 +- tests/roccv/python/test_op_custom_crop.py | 3 +-- tests/roccv/python/test_op_cvt_color.py | 2 +- tests/roccv/python/test_op_flip.py | 2 +- tests/roccv/python/test_op_gamma_contrast.py | 2 +- tests/roccv/python/test_op_histogram.py | 2 +- .../roccv/python/test_op_non_max_suppression.py | 2 +- tests/roccv/python/test_op_normalize.py | 2 +- tests/roccv/python/test_op_reformat.py | 2 +- tests/roccv/python/test_op_remap.py | 2 +- tests/roccv/python/test_op_resize.py | 2 +- tests/roccv/python/test_op_rotate.py | 2 +- tests/roccv/python/test_op_thresholding.py | 2 +- tests/roccv/python/test_op_warp_affine.py | 2 +- tests/roccv/python/test_op_warp_perspective.py | 2 +- 25 files changed, 40 insertions(+), 43 deletions(-) diff --git a/python/include/py_tensor.hpp b/python/include/py_tensor.hpp index f0c3e1d2..180006d2 100644 --- 
a/python/include/py_tensor.hpp +++ b/python/include/py_tensor.hpp @@ -38,11 +38,11 @@ class PyTensor : public std::enable_shared_from_this { * wrap. * * @param shape The shape of the tensor. - * @param layout The layout of the tensor. * @param dtype The data type of the tensor. + * @param layout The layout of the tensor. * @param device The device of the tensor. */ - PyTensor(std::vector shape, eTensorLayout layout, eDataType dtype, eDeviceType device); + PyTensor(std::vector shape, eDataType dtype, eTensorLayout layout, eDeviceType device); /** * @brief Wraps an existing roccv::Tensor inside of a newly constructed PyTensor. diff --git a/python/src/py_tensor.cpp b/python/src/py_tensor.cpp index 4fa4dd1b..5c757c02 100644 --- a/python/src/py_tensor.cpp +++ b/python/src/py_tensor.cpp @@ -59,7 +59,7 @@ DLManagedTensor* createDLManagedTensor(std::shared_ptr tensor, st return dlTensor; } -PyTensor::PyTensor(std::vector shape, eTensorLayout layout, eDataType dtype, eDeviceType device) { +PyTensor::PyTensor(std::vector shape, eDataType dtype, eTensorLayout layout, eDeviceType device) { roccv::TensorShape tShape(roccv::TensorShape(roccv::TensorLayout(layout), shape)); m_tensor = std::make_shared(tShape, roccv::DataType(dtype), device); } @@ -202,15 +202,15 @@ void PyTensor::Export(pybind11::module& m) { pybind11::class_> tensor(m, "Tensor"); tensor - .def(pybind11::init([](std::vector shape, py::object layout, py::object dtype, eDeviceType device) { - return std::make_shared(shape, LayoutFromPyObject(layout), DataTypeFromPyObject(dtype), + .def(pybind11::init([](std::vector shape, py::object dtype, py::object layout, eDeviceType device) { + return std::make_shared(shape, DataTypeFromPyObject(dtype), LayoutFromPyObject(layout), device); }), - "shape"_a, "layout"_a, "dtype"_a, "device"_a = eDeviceType::GPU, - "Constructs a tensor object. ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. " - "``rocpycv.NHWC``) or a layout string (``\"NHWC\"``). 
``dtype`` may be an " - "``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type " - "(e.g. ``np.float32``).") + "shape"_a, "dtype"_a, "layout"_a, "device"_a = eDeviceType::GPU, + "Constructs a tensor object. ``dtype`` may be an ``rocpycv.eDataType`` (e.g. " + "``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). ``layout`` " + "may be an ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NHWC``) or a layout string " + "(``\"NHWC\"``).") .def("copy_to", &PyTensor::copyTo, "device"_a, "Returns a deep copy of the tensor with data copied to a specified device type.") .def("__dlpack__", &PyTensor::toDLPack, "stream"_a = py::none(), diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index ce2e7584..f24fb615 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -170,9 +170,9 @@ class Tensor: """ Returns a tuple containing the DLPack device and device id for the tensor. """ - def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], layout: typing.Any, dtype: typing.Any, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: + def __init__(self, shape: collections.abc.Sequence[typing.SupportsInt | typing.SupportsIndex], dtype: typing.Any, layout: typing.Any, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> None: """ - Constructs a tensor object. ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NHWC``) or a layout string (``"NHWC"``). ``dtype`` may be an ``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). + Constructs a tensor object. ``dtype`` may be an ``rocpycv.eDataType`` (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). ``layout`` may be an ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NHWC``) or a layout string (``"NHWC"``). 
""" def copy_to(self, device: eDeviceType) -> Tensor: """ diff --git a/samples/migraphx-classification/migraphx_classification.py b/samples/migraphx-classification/migraphx_classification.py index b064433d..be7fc66e 100644 --- a/samples/migraphx-classification/migraphx_classification.py +++ b/samples/migraphx-classification/migraphx_classification.py @@ -124,11 +124,11 @@ def main() -> None: # Load/allocate tensors on the GPU input_tensor : rocpycv.Tensor = rocpycv.from_dlpack(np_image, "NHWC").copy_to(rocpycv.GPU) - resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.uint8) - rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.uint8) - f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.float32) - normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), "NHWC", np.float32) - preprocessed : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), "NCHW", np.float32) + resized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.uint8, "NHWC") + rgb : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.uint8, "NHWC") + f32 : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.float32, "NHWC") + normalized : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, INPUT_H, INPUT_W, 3), np.float32, "NHWC") + preprocessed : rocpycv.Tensor = rocpycv.Tensor((BATCH_SIZE, 3, INPUT_H, INPUT_W), np.float32, "NCHW") mean_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_MEAN.reshape(1, 1, 1, 3), "NHWC").copy_to(rocpycv.GPU) std_t : rocpycv.Tensor = rocpycv.from_dlpack(IMAGENET_STD.reshape(1, 1, 1, 3), "NHWC").copy_to(rocpycv.GPU) diff --git a/tests/roccv/python/test_op_adv_cvt_color.py b/tests/roccv/python/test_op_adv_cvt_color.py index 8a243f68..32b7f686 100644 --- a/tests/roccv/python/test_op_adv_cvt_color.py +++ b/tests/roccv/python/test_op_adv_cvt_color.py @@ -62,7 +62,7 @@ 
@pytest.mark.parametrize("samples,width,height", [[1, 64, 48], [2, 128, 72]]) def test_op_advcvtcolor_interleaved444(samples, height, width, code, spec, dtype, device): input_tensor = generate_tensor(samples, width, height, 3, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, 3], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, 3], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.advcvtcolor(input_tensor, code, spec, stream, device) @@ -79,7 +79,7 @@ def test_op_advcvtcolor_interleaved444(samples, height, width, code, spec, dtype @pytest.mark.parametrize("samples,width,height", [[1, 64, 48], [2, 128, 72]]) def test_op_advcvtcolor_interleaved_to_semiplanar(samples, height, width, code, spec, dtype, device): input_tensor = generate_tensor(samples, width, height, 3, dtype, device) - output_golden = rocpycv.Tensor([samples, (height * 3) // 2, width, 1], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, (height * 3) // 2, width, 1], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.advcvtcolor(input_tensor, code, spec, stream, device) @@ -96,7 +96,7 @@ def test_op_advcvtcolor_interleaved_to_semiplanar(samples, height, width, code, @pytest.mark.parametrize("samples,width,height", [[1, 64, 48], [2, 128, 72]]) def test_op_advcvtcolor_semiplanar_to_interleaved(samples, height, width, code, spec, dtype, device): input_tensor = generate_tensor(samples, width, (height * 3) // 2, 1, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, 3], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, 3], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.advcvtcolor(input_tensor, code, spec, stream, device) diff --git a/tests/roccv/python/test_op_bilateral_filter.py 
b/tests/roccv/python/test_op_bilateral_filter.py index 6e0abfd3..72e6708d 100644 --- a/tests/roccv/python/test_op_bilateral_filter.py +++ b/tests/roccv/python/test_op_bilateral_filter.py @@ -44,7 +44,7 @@ ]) def test_op_bilateral_filter(samples, height, width, channels, border_mode, border_val, diameter, sigma_color, sigma_space, dtype, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.bilateral_filter_into(output_golden, input, diameter, sigma_color, diff --git a/tests/roccv/python/test_op_bnd_box.py b/tests/roccv/python/test_op_bnd_box.py index d6fd8045..f5377b65 100644 --- a/tests/roccv/python/test_op_bnd_box.py +++ b/tests/roccv/python/test_op_bnd_box.py @@ -52,8 +52,7 @@ def generate_boxes(samples: int, height: int, width: int) -> rocpycv.BndBoxes: def test_op_remap(samples, height, width, channels, device): input = generate_tensor(samples, width, height, channels, rocpycv.eDataType.U8, device) boxes = generate_boxes(samples, height, width) - output_golden = rocpycv.Tensor([samples, height, width, channels], - rocpycv.eTensorLayout.NHWC, rocpycv.eDataType.U8, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eDataType.U8, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.bndbox(input, boxes, stream, device) diff --git a/tests/roccv/python/test_op_center_crop.py b/tests/roccv/python/test_op_center_crop.py index a8054dcc..8daa507f 100644 --- a/tests/roccv/python/test_op_center_crop.py +++ b/tests/roccv/python/test_op_center_crop.py @@ -41,8 +41,7 @@ ]) def test_op_center_crop(samples, height, width, channels, dtype, box, device): input = generate_tensor(samples, width, height, channels, dtype, device) - 
output_golden = rocpycv.Tensor([samples, box[1], box[0], channels], - rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, box[1], box[0], channels], dtype, rocpycv.eTensorLayout.NHWC, device) if device == rocpycv.eDeviceType.GPU: stream = rocpycv.Stream() diff --git a/tests/roccv/python/test_op_composite.py b/tests/roccv/python/test_op_composite.py index e5d37538..12f43668 100644 --- a/tests/roccv/python/test_op_composite.py +++ b/tests/roccv/python/test_op_composite.py @@ -39,7 +39,7 @@ def test_op_composite(samples, height, width, out_channels, dtype, device): foreground = generate_tensor(samples, width, height, 3, dtype, device) background = generate_tensor(samples, width, height, 3, dtype, device) mask = generate_tensor(samples, width, height, 1, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, out_channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, out_channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.composite(foreground, background, mask, out_channels, stream, device) diff --git a/tests/roccv/python/test_op_convert_to.py b/tests/roccv/python/test_op_convert_to.py index 9e9f389f..c7555531 100644 --- a/tests/roccv/python/test_op_convert_to.py +++ b/tests/roccv/python/test_op_convert_to.py @@ -40,7 +40,7 @@ ]) def test_op_convert_to(samples, height, width, channels, device, dtype, out_dtype, alpha, beta): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, out_dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], out_dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.convert_to_into(output_golden, input, alpha, beta, stream, device) diff --git a/tests/roccv/python/test_op_copy_make_border.py 
b/tests/roccv/python/test_op_copy_make_border.py index 27994e00..fcafe36f 100644 --- a/tests/roccv/python/test_op_copy_make_border.py +++ b/tests/roccv/python/test_op_copy_make_border.py @@ -46,7 +46,7 @@ def test_op_copy_make_border(samples, height, width, channels, top, right, bottom, left, border_mode, border_value, dtype, device): input = generate_tensor(samples, width, height, channels, dtype, device) output_golden = rocpycv.Tensor([samples, height + top + bottom, width + right + left, - channels], rocpycv.eTensorLayout.NHWC, dtype, device) + channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.copymakeborder(input, border_mode, border_value, top, bottom, left, right, stream, device) diff --git a/tests/roccv/python/test_op_custom_crop.py b/tests/roccv/python/test_op_custom_crop.py index 026130c5..47c3f1f1 100644 --- a/tests/roccv/python/test_op_custom_crop.py +++ b/tests/roccv/python/test_op_custom_crop.py @@ -41,8 +41,7 @@ ]) def test_op_custom_crop(samples, height, width, channels, dtype, box, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, box.height, box.width, channels], - rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, box.height, box.width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) if device == rocpycv.eDeviceType.GPU: stream = rocpycv.Stream() rocpycv.custom_crop_into(output_golden, input, box, stream, device) diff --git a/tests/roccv/python/test_op_cvt_color.py b/tests/roccv/python/test_op_cvt_color.py index 448a598b..637fbbfe 100644 --- a/tests/roccv/python/test_op_cvt_color.py +++ b/tests/roccv/python/test_op_cvt_color.py @@ -51,7 +51,7 @@ def test_op_cvtcolor(samples, height, width, code, dtype, device): out_channels = 1 input = generate_tensor(samples, width, height, in_channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, out_channels], 
rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, out_channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.cvtcolor(input, code, stream, device) diff --git a/tests/roccv/python/test_op_flip.py b/tests/roccv/python/test_op_flip.py index 32ed6215..c4980cd6 100644 --- a/tests/roccv/python/test_op_flip.py +++ b/tests/roccv/python/test_op_flip.py @@ -40,7 +40,7 @@ def test_op_flip(samples, width, height, channels, dtype, flip_code, device): input_tensor = generate_tensor(samples, width, height, channels, dtype, device) stream = rocpycv.Stream() - output_tensor_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_tensor_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) rocpycv.flip_into(output_tensor_golden, input_tensor, flip_code, stream, device) output_tensor = rocpycv.flip(input_tensor, flip_code, stream, device) stream.synchronize() diff --git a/tests/roccv/python/test_op_gamma_contrast.py b/tests/roccv/python/test_op_gamma_contrast.py index 68343ec9..38893594 100644 --- a/tests/roccv/python/test_op_gamma_contrast.py +++ b/tests/roccv/python/test_op_gamma_contrast.py @@ -38,7 +38,7 @@ ]) def test_op_gamma_contrast(samples, height, width, channels, gamma, dtype, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.gamma_contrast(input, gamma, stream, device) diff --git a/tests/roccv/python/test_op_histogram.py b/tests/roccv/python/test_op_histogram.py index e980a057..17c741d2 100644 --- a/tests/roccv/python/test_op_histogram.py +++ b/tests/roccv/python/test_op_histogram.py 
@@ -37,7 +37,7 @@ ]) def test_op_histogram(samples, height, width, in_dtype, out_dtype, device): input = generate_tensor(samples, width, height, 1, in_dtype, device) - output_golden = rocpycv.Tensor([samples, 256, 1], rocpycv.eTensorLayout.HWC, out_dtype, device) + output_golden = rocpycv.Tensor([samples, 256, 1], out_dtype, rocpycv.eTensorLayout.HWC, device) stream = rocpycv.Stream() output = rocpycv.histogram(input, None, stream, device) diff --git a/tests/roccv/python/test_op_non_max_suppression.py b/tests/roccv/python/test_op_non_max_suppression.py index 163a0a8f..e39c1e44 100644 --- a/tests/roccv/python/test_op_non_max_suppression.py +++ b/tests/roccv/python/test_op_non_max_suppression.py @@ -44,7 +44,7 @@ def generate_boxes(samples: int, num_boxes: int, device: rocpycv.eDeviceType) -> def test_op_non_max_suppression(samples, num_boxes, device): boxes = generate_boxes(samples, num_boxes, device) scores = generate_tensor_generic([samples, num_boxes], rocpycv.eTensorLayout.NW, rocpycv.eDataType.F32, device) - output_golden = rocpycv.Tensor([samples, num_boxes], rocpycv.eTensorLayout.NW, rocpycv.eDataType.U8, device) + output_golden = rocpycv.Tensor([samples, num_boxes], rocpycv.eDataType.U8, rocpycv.eTensorLayout.NW, device) stream = rocpycv.Stream() # Hardcoding the score and IoU threshold here. 
The only thing we care about is the resulting size of the diff --git a/tests/roccv/python/test_op_normalize.py b/tests/roccv/python/test_op_normalize.py index e25bc300..72036515 100644 --- a/tests/roccv/python/test_op_normalize.py +++ b/tests/roccv/python/test_op_normalize.py @@ -39,7 +39,7 @@ def test_op_normalize(samples, height, width, channels, device, dtype): input = generate_tensor(samples, width, height, channels, dtype, device) base = generate_tensor(1, 1, 1, channels, rocpycv.eDataType.F32, device) scale = generate_tensor(1, 1, 1, channels, rocpycv.eDataType.F32, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.normalize_into(output_golden, input, base, scale, None, 1.0, 0.0, 0.0, stream, device) diff --git a/tests/roccv/python/test_op_reformat.py b/tests/roccv/python/test_op_reformat.py index 62715ff9..10d2b7a6 100644 --- a/tests/roccv/python/test_op_reformat.py +++ b/tests/roccv/python/test_op_reformat.py @@ -54,7 +54,7 @@ def test_op_reformat(samples, height, width, channels, inLayout, outLayout, devi input_shape = create_tensor_shape(inLayout, samples, channels, height, width) output_shape = create_tensor_shape(outLayout, samples, channels, height, width) input_tensor = generate_tensor_generic(input_shape, inLayout, dtype, device) - output_golden = rocpycv.Tensor(output_shape, outLayout, dtype, device) + output_golden = rocpycv.Tensor(output_shape, dtype, outLayout, device) stream = rocpycv.Stream() rocpycv.reformat_into(input_tensor, output_golden, stream, device) diff --git a/tests/roccv/python/test_op_remap.py b/tests/roccv/python/test_op_remap.py index 25321f9e..e9ccc394 100644 --- a/tests/roccv/python/test_op_remap.py +++ b/tests/roccv/python/test_op_remap.py @@ -44,7 +44,7 @@ def test_op_remap(samples, width, height, channels, dtype, 
map_interp, interp, map_type, align_corners, border_mode, border_val, device): input_tensor = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) if (map_type == rocpycv.REMAP_ABSOLUTE): diff --git a/tests/roccv/python/test_op_resize.py b/tests/roccv/python/test_op_resize.py index 721c1d7f..70e0c897 100644 --- a/tests/roccv/python/test_op_resize.py +++ b/tests/roccv/python/test_op_resize.py @@ -38,7 +38,7 @@ def test_op_resize(out_shape, in_shape, samples, channels, interp, dtype, device # Input/Output shapes are passed in as format [width, height] input = generate_tensor(samples, in_shape[0], in_shape[1], channels, dtype, device) output_shape = (samples, out_shape[1], out_shape[0], channels) - output_golden = rocpycv.Tensor(output_shape, rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor(output_shape, dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() rocpycv.resize_into(output_golden, input, interp, stream, device) diff --git a/tests/roccv/python/test_op_rotate.py b/tests/roccv/python/test_op_rotate.py index 13c2fceb..1fa1e8e8 100644 --- a/tests/roccv/python/test_op_rotate.py +++ b/tests/roccv/python/test_op_rotate.py @@ -46,7 +46,7 @@ def calc_center_shift(center_x, center_y, angle) -> tuple[float, float]: ]) def test_op_rotate(samples, width, height, channels, angle, dtype, interp, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) center_x = (width - 1) / 2 center_y = (height - 1) / 2 diff --git 
a/tests/roccv/python/test_op_thresholding.py b/tests/roccv/python/test_op_thresholding.py index 1eae8d97..8613a266 100644 --- a/tests/roccv/python/test_op_thresholding.py +++ b/tests/roccv/python/test_op_thresholding.py @@ -41,7 +41,7 @@ def test_op_thresholding(samples, height, width, channels, dtype, thresh, mvdata, threshType, device): input_tensor = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) thresh_array = np.full(samples, thresh, np.float64) maxval_array = np.full(samples, mvdata, np.float64) diff --git a/tests/roccv/python/test_op_warp_affine.py b/tests/roccv/python/test_op_warp_affine.py index b4fd0dfb..9b9656b0 100644 --- a/tests/roccv/python/test_op_warp_affine.py +++ b/tests/roccv/python/test_op_warp_affine.py @@ -41,7 +41,7 @@ ]) def test_op_warp_affine(samples, width, height, channels, dtype, mat, inverted, interp, border_mode, border_val, device): input = generate_tensor(samples, width, height, channels, dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) stream = rocpycv.Stream() output = rocpycv.warp_affine(input, mat, inverted, interp, border_mode, border_val, stream, device) diff --git a/tests/roccv/python/test_op_warp_perspective.py b/tests/roccv/python/test_op_warp_perspective.py index c72754a6..3164a951 100644 --- a/tests/roccv/python/test_op_warp_perspective.py +++ b/tests/roccv/python/test_op_warp_perspective.py @@ -41,7 +41,7 @@ ]) def test_op_warp_perspective(samples, width, height, channels, dtype, mat, inverted, interp, border_mode, border_val, device): input = generate_tensor(samples, width, height, channels, 
dtype, device) - output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eTensorLayout.NHWC, dtype, device) + output_golden = rocpycv.Tensor([samples, height, width, channels], dtype, rocpycv.eTensorLayout.NHWC, device) print(output_golden.shape()) stream = rocpycv.Stream() From ea1caff478cf1b806e5cac736ac570add7825741 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 18:30:43 -0400 Subject: [PATCH 13/20] Add numpy/string layout options for convert_to and reformat python operators --- python/src/operators/py_op_convert_to.cpp | 18 +++++++++++++----- python/src/operators/py_op_reformat.cpp | 13 ++++++++++--- python/src/rocpycv.pyi | 12 +++++++----- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/python/src/operators/py_op_convert_to.cpp b/python/src/operators/py_op_convert_to.cpp index 9100ce3f..ad3a32a5 100644 --- a/python/src/operators/py_op_convert_to.cpp +++ b/python/src/operators/py_op_convert_to.cpp @@ -24,6 +24,8 @@ THE SOFTWARE. #include +#include "py_helpers.hpp" + PyTensor PyOpConvertTo::Execute(PyTensor& input, eDataType dtype, double alpha, double beta, std::optional> stream, eDeviceType device) { hipStream_t hipStream = stream.has_value() ? 
stream.value().get().getStream() : nullptr; @@ -44,17 +46,23 @@ void PyOpConvertTo::ExecuteInto(PyTensor& output, PyTensor& input, double alpha, void PyOpConvertTo::Export(py::module& m) { using namespace py::literals; - m.def("convert_to", &PyOpConvertTo::Execute, "src"_a, "dtype"_a, "alpha"_a = 1.0, "beta"_a = 0.0, - "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc( - + m.def("convert_to", + [](PyTensor& input, py::object dtype, double alpha, double beta, + std::optional> stream, eDeviceType device) { + return PyOpConvertTo::Execute(input, DataTypeFromPyObject(dtype), alpha, beta, stream, device); + }, + "src"_a, "dtype"_a, "alpha"_a = 1.0, "beta"_a = 0.0, "stream"_a = nullptr, + "device"_a = eDeviceType::GPU, R"pbdoc( + Executes the Convert To operation on the given HIP stream. See also: Refer to the rocCV C++ API reference for more information on this operation. - + Args: src (rocpycv.Tensor): Input tensor containing one or more images. - dtype (eDataType): Datatype of the output tensor. + dtype: Datatype of the output tensor. Either an ``rocpycv.eDataType`` + (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). alpha (double, optional): Scalar for output data. Defaults to 1.0. beta (double, optional): Offset for the data. Defaults to 0.0. stream (rocpycv.Stream, optional): HIP stream to run this operation on. diff --git a/python/src/operators/py_op_reformat.cpp b/python/src/operators/py_op_reformat.cpp index 6cfa4105..1351ca53 100644 --- a/python/src/operators/py_op_reformat.cpp +++ b/python/src/operators/py_op_reformat.cpp @@ -22,6 +22,8 @@ THE SOFTWARE. #include "operators/py_op_reformat.hpp" +#include "py_helpers.hpp" + void PyOpReformat::ExecuteInto(PyTensor& output, PyTensor& input, std::optional> stream, eDeviceType device) { hipStream_t hipStream = stream.has_value() ? 
stream.value().get().getStream() : nullptr; @@ -46,8 +48,12 @@ PyTensor PyOpReformat::Execute(PyTensor& input, eTensorLayout outLayout, void PyOpReformat::Export(py::module& m) { using namespace py::literals; - m.def("reformat", &PyOpReformat::Execute, "input"_a, "out_layout"_a, "stream"_a = nullptr, - "device"_a = eDeviceType::GPU, R"pbdoc( + m.def("reformat", + [](PyTensor& input, py::object outLayout, + std::optional> stream, eDeviceType device) { + return PyOpReformat::Execute(input, LayoutFromPyObject(outLayout), stream, device); + }, + "input"_a, "out_layout"_a, "stream"_a = nullptr, "device"_a = eDeviceType::GPU, R"pbdoc( Executes the Reformat operation and returns the result as a new tensor. See also: @@ -55,7 +61,8 @@ void PyOpReformat::Export(py::module& m) { Args: input (rocpycv.Tensor): Input tensor to reformat. - out_layout (rocpycv.eTensorLayout): The layout to reformat the input tensor to. + out_layout: The layout to reformat the input tensor to. Either an + ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NCHW``) or a layout string (``"NCHW"``). stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. 
diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index f24fb615..b9a0aee7 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -913,16 +913,17 @@ def composite_into(dst: Tensor, foreground: Tensor, background: Tensor, fgmask: Returns: None """ -def convert_to(src: Tensor, dtype: eDataType, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: +def convert_to(src: Tensor, dtype: typing.Any, alpha: typing.SupportsFloat | typing.SupportsIndex = 1.0, beta: typing.SupportsFloat | typing.SupportsIndex = 0.0, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Convert To operation on the given HIP stream. See also: Refer to the rocCV C++ API reference for more information on this operation. - + Args: src (rocpycv.Tensor): Input tensor containing one or more images. - dtype (eDataType): Datatype of the output tensor. + dtype: Datatype of the output tensor. Either an ``rocpycv.eDataType`` + (e.g. ``rocpycv.F32``) or a NumPy dtype/scalar type (e.g. ``np.float32``). alpha (double, optional): Scalar for output data. Defaults to 1.0. beta (double, optional): Offset for the data. Defaults to 0.0. stream (rocpycv.Stream, optional): HIP stream to run this operation on. @@ -1239,7 +1240,7 @@ def normalize_into(dst: Tensor, src: Tensor, base: Tensor, scale: Tensor, flags: Returns: None """ -def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: +def reformat(input: Tensor, out_layout: typing.Any, stream: rocpycv.Stream | None = None, device: eDeviceType = eDeviceType.eDeviceType.GPU) -> Tensor: """ Executes the Reformat operation and returns the result as a new tensor. 
@@ -1248,7 +1249,8 @@ def reformat(input: Tensor, out_layout: eTensorLayout, stream: rocpycv.Stream | Args: input (rocpycv.Tensor): Input tensor to reformat. - out_layout (rocpycv.eTensorLayout): The layout to reformat the input tensor to. + out_layout: The layout to reformat the input tensor to. Either an + ``rocpycv.eTensorLayout`` (e.g. ``rocpycv.NCHW``) or a layout string (``"NCHW"``). stream (rocpycv.Stream, optional): HIP stream to run this operation on. device (rocpycv.Device, optional): The device to run this operation on. Defaults to GPU. From b4c39d5d2ccf062a23f0b34a6010cdf8713ef237 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Wed, 29 Apr 2026 18:56:01 -0400 Subject: [PATCH 14/20] Improve initial rocpycv module documentation --- python/src/main.cpp | 57 +++++++++++++++++++++++++++++++++++++++--- python/src/rocpycv.pyi | 57 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 108 insertions(+), 6 deletions(-) diff --git a/python/src/main.cpp b/python/src/main.cpp index ddbec687..e34c6fba 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -52,9 +52,60 @@ THE SOFTWARE. PYBIND11_MODULE(rocpycv, m) { m.doc() = R"pbdoc( - Python API reference - ----------------------- - This is the Python API reference for rocCV. + rocpycv — AMD GPU-accelerated image pre/post-processing + ======================================================= + + rocpycv is the Python binding for rocCV, a HIP/ROCm image processing + library. It exposes a NumPy-friendly :class:`Tensor` and a suite of + operators (resize, normalize, color conversion, geometric warps, ...) + that run on either GPU (default) or CPU. + + Quick start + ----------- + .. code-block:: python + + import numpy as np + import rocpycv + + # Wrap a NumPy array as a CPU Tensor (zero-copy via DLPack), then + # copy it to the GPU (explicit H2D transfer). 
+ host = np.zeros((1, 480, 640, 3), np.uint8) + src = rocpycv.from_dlpack(host, "NHWC").copy_to(rocpycv.GPU) + + # Functional form: operators allocate and return a new Tensor. + resized = rocpycv.resize(src, (1, 224, 224, 3), rocpycv.LINEAR) + chw = rocpycv.reformat(resized, "NCHW") + + # ``*_into`` form: write into a caller-allocated output, optionally + # on a stream — useful in hot preprocessing loops. + stream = rocpycv.Stream() + out = rocpycv.Tensor((1, 224, 224, 3), np.uint8, "NHWC") + rocpycv.resize_into(out, src, rocpycv.LINEAR, stream) + stream.synchronize() + + Tensors + ------- + :class:`Tensor` arguments accept either rocpycv enums or familiar + Python types: + + * ``dtype`` — ``rocpycv.F32`` or any NumPy dtype/scalar (``np.float32``). + * ``layout`` — ``rocpycv.NHWC`` or a layout string (``"NHWC"``). + + For zero-copy interop, tensors implement the DLPack protocol — pass any + ``__dlpack__``-supporting object (NumPy array, PyTorch tensor, ...) to + :func:`from_dlpack`, and use :meth:`Tensor.data_ptr` to hand a raw GPU + pointer to inference frameworks such as MIGraphX. + + Operators + --------- + Most operators come in two forms: + + * ``op(src, ...)`` — allocates and returns a new :class:`Tensor`. + * ``op_into(dst, src, ...)`` — writes into a pre-allocated output, + avoiding per-call allocation in tight loops. + + All operators accept an optional ``stream`` (a :class:`Stream` wrapping + a ``hipStream_t``) and a ``device`` argument (defaults to GPU). )pbdoc"; PyException::Export(m); PyEnums::Export(m); diff --git a/python/src/rocpycv.pyi b/python/src/rocpycv.pyi index b9a0aee7..270fb848 100644 --- a/python/src/rocpycv.pyi +++ b/python/src/rocpycv.pyi @@ -1,8 +1,59 @@ """ - Python API reference - ----------------------- - This is the Python API reference for rocCV. 
+ rocpycv — AMD GPU-accelerated image pre/post-processing + ======================================================= + + rocpycv is the Python binding for rocCV, a HIP/ROCm image processing + library. It exposes a NumPy-friendly :class:`Tensor` and a suite of + operators (resize, normalize, color conversion, geometric warps, ...) + that run on either GPU (default) or CPU. + + Quick start + ----------- + .. code-block:: python + + import numpy as np + import rocpycv + + # Wrap a NumPy array as a CPU Tensor (zero-copy via DLPack), then + # copy it to the GPU (explicit H2D transfer). + host = np.zeros((1, 480, 640, 3), np.uint8) + src = rocpycv.from_dlpack(host, "NHWC").copy_to(rocpycv.GPU) + + # Functional form: operators allocate and return a new Tensor. + resized = rocpycv.resize(src, (1, 224, 224, 3), rocpycv.LINEAR) + chw = rocpycv.reformat(resized, "NCHW") + + # ``*_into`` form: write into a caller-allocated output, optionally + # on a stream — useful in hot preprocessing loops. + stream = rocpycv.Stream() + out = rocpycv.Tensor((1, 224, 224, 3), np.uint8, "NHWC") + rocpycv.resize_into(out, src, rocpycv.LINEAR, stream) + stream.synchronize() + + Tensors + ------- + :class:`Tensor` arguments accept either rocpycv enums or familiar + Python types: + + * ``dtype`` — ``rocpycv.F32`` or any NumPy dtype/scalar (``np.float32``). + * ``layout`` — ``rocpycv.NHWC`` or a layout string (``"NHWC"``). + + For zero-copy interop, tensors implement the DLPack protocol — pass any + ``__dlpack__``-supporting object (NumPy array, PyTorch tensor, ...) to + :func:`from_dlpack`, and use :meth:`Tensor.data_ptr` to hand a raw GPU + pointer to inference frameworks such as MIGraphX. + + Operators + --------- + Most operators come in two forms: + + * ``op(src, ...)`` — allocates and returns a new :class:`Tensor`. + * ``op_into(dst, src, ...)`` — writes into a pre-allocated output, + avoiding per-call allocation in tight loops. 
+ + All operators accept an optional ``stream`` (a :class:`Stream` wrapping + a ``hipStream_t``) and a ``device`` argument (defaults to GPU). """ from __future__ import annotations From edc8dac8cce3b2886730b415512b1cc1bb7341b8 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 11:35:10 -0400 Subject: [PATCH 15/20] Rename test_op_remap -> test_op_bndbox --- tests/roccv/python/test_op_bnd_box.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/roccv/python/test_op_bnd_box.py b/tests/roccv/python/test_op_bnd_box.py index f5377b65..1b203a5a 100644 --- a/tests/roccv/python/test_op_bnd_box.py +++ b/tests/roccv/python/test_op_bnd_box.py @@ -49,7 +49,7 @@ def generate_boxes(samples: int, height: int, width: int) -> rocpycv.BndBoxes: (3, 150, 50), (7, 15, 23) ]) -def test_op_remap(samples, height, width, channels, device): +def test_op_bndbox(samples, height, width, channels, device): input = generate_tensor(samples, width, height, channels, rocpycv.eDataType.U8, device) boxes = generate_boxes(samples, height, width) output_golden = rocpycv.Tensor([samples, height, width, channels], rocpycv.eDataType.U8, rocpycv.eTensorLayout.NHWC, device) From e52f8dcaf438a645d4cb1c85edca91270978acd6 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 11:38:36 -0400 Subject: [PATCH 16/20] Remove support for 4S16 in DLTypeToRoccvType --- python/src/py_helpers.cpp | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/python/src/py_helpers.cpp b/python/src/py_helpers.cpp index 8d4dce55..15ea92fc 100644 --- a/python/src/py_helpers.cpp +++ b/python/src/py_helpers.cpp @@ -22,23 +22,23 @@ THE SOFTWARE. 
#include "py_helpers.hpp" -#include #include +#include #include #include eDataType DLTypeToRoccvType(DLDataType dtype) { + if (dtype.lanes != 1) { + throw std::runtime_error("Datatype is not supported."); + } + if (dtype.bits == 8) { if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U8; if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S8; } else if (dtype.bits == 16) { - if (dtype.lanes == 4) { - return eDataType::DATA_TYPE_4S16; - } else if (dtype.lanes == 1) { - if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U16; - if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S16; - } + if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U16; + if (dtype.code == kDLInt) return eDataType::DATA_TYPE_S16; } else if (dtype.bits == 32) { if (dtype.code == kDLFloat) return eDataType::DATA_TYPE_F32; if (dtype.code == kDLUInt) return eDataType::DATA_TYPE_U32; @@ -181,18 +181,23 @@ eDataType DataTypeFromPyObject(py::object obj) { static const py::object np_dtype = py::module_::import("numpy").attr("dtype"); dt = np_dtype(obj).cast(); } catch (const std::exception&) { - throw std::runtime_error( - "dtype must be an rocpycv.eDataType or a NumPy dtype/scalar type (e.g. np.float32)."); + throw std::runtime_error("dtype must be an rocpycv.eDataType or a NumPy dtype/scalar type (e.g. 
np.float32)."); } DLDataTypeCode code; switch (dt.kind()) { - case 'u': code = kDLUInt; break; - case 'i': code = kDLInt; break; - case 'f': code = kDLFloat; break; + case 'u': + code = kDLUInt; + break; + case 'i': + code = kDLInt; + break; + case 'f': + code = kDLFloat; + break; default: - throw std::runtime_error("Unsupported NumPy dtype for rocpycv.Tensor (kind '" + - std::string(1, dt.kind()) + "')."); + throw std::runtime_error("Unsupported NumPy dtype for rocpycv.Tensor (kind '" + std::string(1, dt.kind()) + + "')."); } DLDataType dl{static_cast(code), static_cast(dt.itemsize() * 8), 1}; return DLTypeToRoccvType(dl); From ac7451787ccda810716536a789d6800b391d1515 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 13:24:56 -0400 Subject: [PATCH 17/20] Add test for PyStream handle() --- tests/roccv/python/test_py_stream.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 tests/roccv/python/test_py_stream.py diff --git a/tests/roccv/python/test_py_stream.py b/tests/roccv/python/test_py_stream.py new file mode 100644 index 00000000..8b35b16d --- /dev/null +++ b/tests/roccv/python/test_py_stream.py @@ -0,0 +1,32 @@ +# ############################################################################## +# Copyright (c) - 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# ############################################################################## + +import rocpycv + + +def test_stream_handle_returns_valid_pointer(): + stream = rocpycv.Stream() + handle = stream.handle() + + assert isinstance(handle, int) + assert handle != 0 From c150f7927043e1c148518baee2df391c2e734cc0 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 13:31:37 -0400 Subject: [PATCH 18/20] Add basic PyTensor test --- tests/roccv/python/test_py_tensor.py | 46 ++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 tests/roccv/python/test_py_tensor.py diff --git a/tests/roccv/python/test_py_tensor.py b/tests/roccv/python/test_py_tensor.py new file mode 100644 index 00000000..684f5d45 --- /dev/null +++ b/tests/roccv/python/test_py_tensor.py @@ -0,0 +1,46 @@ +# ############################################################################## +# Copyright (c) - 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +# ############################################################################## + +import pytest +import rocpycv + + +@pytest.mark.parametrize("device", [rocpycv.eDeviceType.GPU, rocpycv.eDeviceType.CPU]) +@pytest.mark.parametrize("dtype", [rocpycv.eDataType.U8, rocpycv.eDataType.F32, rocpycv.eDataType.S32]) +@pytest.mark.parametrize( + "shape, layout", + [ + ([2, 32, 64, 3], rocpycv.eTensorLayout.NHWC), + ([1, 3, 16, 16], rocpycv.eTensorLayout.NCHW), + ([8, 8, 4], rocpycv.eTensorLayout.HWC), + ], +) +def test_tensor_basic_properties(shape, layout, dtype, device): + tensor = rocpycv.Tensor(shape, dtype, layout, device) + + assert tensor.shape() == shape + assert tensor.ndim() == len(shape) + assert tensor.layout() == layout + assert tensor.device() == device + assert tensor.dtype() == dtype + assert tensor.data_ptr() != 0 From 07126e7253508e826e524ddf2be197d080d0d357 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 13:31:54 -0400 Subject: [PATCH 19/20] Update year --- tests/roccv/python/test_py_stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/roccv/python/test_py_stream.py b/tests/roccv/python/test_py_stream.py index 8b35b16d..1560e4f5 100644 --- a/tests/roccv/python/test_py_stream.py +++ b/tests/roccv/python/test_py_stream.py @@ -1,5 +1,5 @@ # ############################################################################## -# Copyright (c) - 2025 Advanced Micro Devices, Inc. +# Copyright (c) - 2026 Advanced Micro Devices, Inc. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From f8c60eb06afd34796fe9377d5126f1f62b1db330 Mon Sep 17 00:00:00 2001 From: Zach Vincze Date: Thu, 30 Apr 2026 14:09:07 -0400 Subject: [PATCH 20/20] Add dtype/layout tests for PyTensor --- tests/roccv/python/test_py_tensor.py | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/roccv/python/test_py_tensor.py b/tests/roccv/python/test_py_tensor.py index 684f5d45..d394de8b 100644 --- a/tests/roccv/python/test_py_tensor.py +++ b/tests/roccv/python/test_py_tensor.py @@ -21,6 +21,7 @@ # # ############################################################################## +import numpy as np import pytest import rocpycv @@ -44,3 +45,41 @@ def test_tensor_basic_properties(shape, layout, dtype, device): assert tensor.device() == device assert tensor.dtype() == dtype assert tensor.data_ptr() != 0 + + +@pytest.mark.parametrize( + "dtype_in, expected_dtype", + [ + (rocpycv.eDataType.U8, rocpycv.eDataType.U8), + (rocpycv.eDataType.F32, rocpycv.eDataType.F32), + (np.uint8, rocpycv.eDataType.U8), + (np.float32, rocpycv.eDataType.F32), + (np.int32, rocpycv.eDataType.S32), + (np.dtype("uint16"), rocpycv.eDataType.U16), + ], +) +@pytest.mark.parametrize( + "layout_in, expected_layout, shape", + [ + (rocpycv.eTensorLayout.NHWC, rocpycv.eTensorLayout.NHWC, [2, 32, 64, 3]), + ("NHWC", rocpycv.eTensorLayout.NHWC, [2, 32, 64, 3]), + ("NCHW", rocpycv.eTensorLayout.NCHW, [1, 3, 16, 16]), + ("HWC", rocpycv.eTensorLayout.HWC, [8, 8, 4]), + ], +) +def test_tensor_construction_from_numpy_and_strings(dtype_in, expected_dtype, layout_in, expected_layout, shape): + tensor = rocpycv.Tensor(shape, dtype_in, layout_in, rocpycv.eDeviceType.GPU) + + assert tensor.shape() == shape + assert tensor.dtype() == expected_dtype + assert tensor.layout() == expected_layout + + +def 
test_tensor_construction_invalid_dtype_raises(): + with pytest.raises(Exception): + rocpycv.Tensor([1, 8, 8, 3], "not_a_dtype", rocpycv.eTensorLayout.NHWC, rocpycv.eDeviceType.GPU) + + +def test_tensor_construction_invalid_layout_raises(): + with pytest.raises(Exception): + rocpycv.Tensor([1, 8, 8, 3], rocpycv.eDataType.U8, "ZYXW", rocpycv.eDeviceType.GPU)