Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion cuda_core/cuda/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,9 @@
)
from cuda.core._module import Kernel, ObjectCode # noqa: E402
from cuda.core._program import Program, ProgramOptions # noqa: E402
from cuda.core._stream import Stream, StreamOptions # noqa: E402
from cuda.core._stream import ( # noqa: E402
LEGACY_DEFAULT_STREAM,
PER_THREAD_DEFAULT_STREAM,
Stream,
StreamOptions,
)
48 changes: 6 additions & 42 deletions cuda_core/cuda/core/_stream.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -107,49 +107,13 @@ cdef class Stream:
return s

@classmethod
def legacy_default(cls):
"""Return the legacy default stream.

The legacy default stream is an implicit stream which synchronizes
with all other streams in the same CUDA context except for non-blocking
streams. When any operation is launched on the legacy default stream,
it waits for all previously launched operations in blocking streams to
complete, and all subsequent operations in blocking streams wait for
the legacy default stream operation to complete.

Returns
-------
Stream
The legacy default stream instance for the current context.

See Also
--------
per_thread_default : Per-thread default stream alternative.

"""
def _legacy_default(cls):
"""Return the legacy default stream (supports subclassing)."""
return Stream._from_handle(cls, get_legacy_stream())

@classmethod
def per_thread_default(cls):
"""Return the per-thread default stream.

The per-thread default stream is local to both the calling thread and
the CUDA context. Unlike the legacy default stream, it does not
synchronize with other streams and behaves like an explicitly created
non-blocking stream. This allows for better concurrency in multi-threaded
applications.

Returns
-------
Stream
The per-thread default stream instance for the current thread
and context.

See Also
--------
legacy_default : Legacy default stream alternative.

"""
def _per_thread_default(cls):
"""Return the per-thread default stream (supports subclassing)."""
return Stream._from_handle(cls, get_per_thread_stream())

@classmethod
Expand Down Expand Up @@ -404,8 +368,8 @@ cdef class Stream:


# c-only python objects, not public
cdef Stream C_LEGACY_DEFAULT_STREAM = Stream.legacy_default()
cdef Stream C_PER_THREAD_DEFAULT_STREAM = Stream.per_thread_default()
cdef Stream C_LEGACY_DEFAULT_STREAM = Stream._legacy_default()
cdef Stream C_PER_THREAD_DEFAULT_STREAM = Stream._per_thread_default()

# standard python objects, public
LEGACY_DEFAULT_STREAM = C_LEGACY_DEFAULT_STREAM
Expand Down
33 changes: 33 additions & 0 deletions cuda_core/docs/source/release/0.6.0-notes.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
.. SPDX-License-Identifier: Apache-2.0

.. currentmodule:: cuda.core

``cuda.core`` 0.6.0 Release Notes
==================================

New features
------------

- Added public access to default CUDA streams via module-level constants ``LEGACY_DEFAULT_STREAM`` and ``PER_THREAD_DEFAULT_STREAM``.

Users can now access default streams directly from the ``cuda.core`` namespace:

.. code-block:: python

from cuda.core import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM

# Use legacy default stream (synchronizes with all blocking streams)
LEGACY_DEFAULT_STREAM.sync()

# Use per-thread default stream (thread-local; only synchronizes implicitly with the legacy default stream)
PER_THREAD_DEFAULT_STREAM.sync()

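The legacy default stream synchronizes with all blocking streams in the same CUDA context, ensuring strict ordering but potentially limiting concurrency. The per-thread default stream is local to the calling thread and does not implicitly synchronize with other explicitly created streams, enabling concurrent execution in multi-threaded applications. Note that the per-thread default stream is not a non-blocking stream: it still synchronizes with the legacy default stream if both are used in the same program.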

This replaces the previous undocumented workaround of using ``Stream.from_handle(0)`` to access the legacy default stream.

Fixes and enhancements
-----------------------

None.
20 changes: 2 additions & 18 deletions cuda_core/tests/test_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,34 +117,18 @@ def test_stream_legacy_default_subclassing():
class MyStream(Stream):
pass

stream = MyStream.legacy_default()
stream = MyStream._legacy_default()
assert isinstance(stream, MyStream)


def test_stream_per_thread_default_subclassing():
class MyStream(Stream):
pass

stream = MyStream.per_thread_default()
stream = MyStream._per_thread_default()
assert isinstance(stream, MyStream)


def test_stream_legacy_default_public_api(init_cuda):
"""Test public legacy_default() method."""
stream = Stream.legacy_default()
assert isinstance(stream, Stream)
# Verify it's the same as LEGACY_DEFAULT_STREAM
assert stream == LEGACY_DEFAULT_STREAM


def test_stream_per_thread_default_public_api(init_cuda):
"""Test public per_thread_default() method."""
stream = Stream.per_thread_default()
assert isinstance(stream, Stream)
# Verify it's the same as PER_THREAD_DEFAULT_STREAM
assert stream == PER_THREAD_DEFAULT_STREAM


# ============================================================================
# Stream Equality Tests
# ============================================================================
Expand Down