From 98bf0d490a7db8106a3ccfebba557f0856e3560e Mon Sep 17 00:00:00 2001 From: Monishver Chandrasekaran Date: Wed, 14 Jan 2026 20:00:33 -0500 Subject: [PATCH 1/5] fix: Make legacy_default() and per_thread_default() return singletons - Fixes #1494 Signed-off-by: Monishver Chandrasekaran --- cuda_core/cuda/core/_stream.pyx | 51 +++++++++++++++++++++++++++++---- cuda_core/tests/test_stream.py | 36 ++++++++++++++++------- 2 files changed, 72 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index 05cbcce76a..65ca362121 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -117,17 +117,38 @@ cdef class Stream: complete, and all subsequent operations in blocking streams wait for the legacy default stream operation to complete. + This stream is useful for ensuring strict ordering of operations but + may limit concurrency. For better performance in concurrent scenarios, + consider using per_thread_default() or creating explicit streams. + + This method returns the same singleton instance on every call for the + base Stream class. Subclasses will receive new instances of the subclass + type that wrap the same underlying CUDA stream. + Returns ------- Stream - The legacy default stream instance for the current context. + The legacy default stream singleton instance for the current context. See Also -------- per_thread_default : Per-thread default stream alternative. + from_handle : Create stream from existing handle. 
+ Examples + -------- + >>> from cuda.core import Stream + >>> stream1 = Stream.legacy_default() + >>> stream2 = Stream.legacy_default() + >>> stream1 is stream2 # True - returns same singleton + True """ - return Stream._from_handle(cls, get_legacy_stream()) + # Return the singleton for the base Stream class + if cls is Stream: + return C_LEGACY_DEFAULT_STREAM + # For subclasses, create a new instance of the subclass type + else: + return Stream._from_handle(cls, get_legacy_stream()) @classmethod def per_thread_default(cls): @@ -139,18 +160,38 @@ cdef class Stream: non-blocking stream. This allows for better concurrency in multi-threaded applications. + Each thread has its own per-thread default stream, enabling true + concurrent execution without implicit synchronization barriers. + + This method returns the same singleton instance on every call for the + base Stream class. Subclasses will receive new instances of the subclass + type that wrap the same underlying CUDA stream. + Returns ------- Stream - The per-thread default stream instance for the current thread - and context. + The per-thread default stream singleton instance for the current + thread and context. See Also -------- legacy_default : Legacy default stream alternative. + from_handle : Create stream from existing handle. 
+ Examples + -------- + >>> from cuda.core import Stream + >>> stream1 = Stream.per_thread_default() + >>> stream2 = Stream.per_thread_default() + >>> stream1 is stream2 # True - returns same singleton + True """ - return Stream._from_handle(cls, get_per_thread_stream()) + # Return the singleton for the base Stream class + if cls is Stream: + return C_PER_THREAD_DEFAULT_STREAM + # For subclasses, create a new instance of the subclass type + else: + return Stream._from_handle(cls, get_per_thread_stream()) @classmethod def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None, diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index a40910dbf4..dff6a3e3a4 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -130,19 +130,35 @@ class MyStream(Stream): def test_stream_legacy_default_public_api(init_cuda): - """Test public legacy_default() method.""" - stream = Stream.legacy_default() - assert isinstance(stream, Stream) - # Verify it's the same as LEGACY_DEFAULT_STREAM - assert stream == LEGACY_DEFAULT_STREAM + """Test public legacy_default() method returns singleton.""" + stream1 = Stream.legacy_default() + stream2 = Stream.legacy_default() + + assert isinstance(stream1, Stream) + assert isinstance(stream2, Stream) + + # Verify singleton behavior - same Python object + assert stream1 is stream2, "Should return same singleton instance" + + # Verify it's the same as the module constant + assert stream1 is LEGACY_DEFAULT_STREAM, "Should be the same object as LEGACY_DEFAULT_STREAM" + assert stream2 is LEGACY_DEFAULT_STREAM, "Should be the same object as LEGACY_DEFAULT_STREAM" def test_stream_per_thread_default_public_api(init_cuda): - """Test public per_thread_default() method.""" - stream = Stream.per_thread_default() - assert isinstance(stream, Stream) - # Verify it's the same as PER_THREAD_DEFAULT_STREAM - assert stream == PER_THREAD_DEFAULT_STREAM + """Test public per_thread_default() method 
returns singleton.""" + stream1 = Stream.per_thread_default() + stream2 = Stream.per_thread_default() + + assert isinstance(stream1, Stream) + assert isinstance(stream2, Stream) + + # Verify singleton behavior - same Python object + assert stream1 is stream2, "Should return same singleton instance" + + # Verify it's the same as the module constant + assert stream1 is PER_THREAD_DEFAULT_STREAM, "Should be the same object as PER_THREAD_DEFAULT_STREAM" + assert stream2 is PER_THREAD_DEFAULT_STREAM, "Should be the same object as PER_THREAD_DEFAULT_STREAM" # ============================================================================ From ff9b8af3986cb32fbba0ac3a5588ddcaaa587228 Mon Sep 17 00:00:00 2001 From: Monishver Chandrasekaran Date: Wed, 14 Jan 2026 22:50:52 -0500 Subject: [PATCH 2/5] fix to avoid the circular initialization issue Signed-off-by: Monishver Chandrasekaran --- cuda_core/cuda/core/_stream.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index 65ca362121..c3093e674a 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -455,8 +455,8 @@ cdef class Stream: # c-only python objects, not public -cdef Stream C_LEGACY_DEFAULT_STREAM = Stream.legacy_default() -cdef Stream C_PER_THREAD_DEFAULT_STREAM = Stream.per_thread_default() +cdef Stream C_LEGACY_DEFAULT_STREAM = Stream._from_handle(Stream, get_legacy_stream()) +cdef Stream C_PER_THREAD_DEFAULT_STREAM = Stream._from_handle(Stream, get_per_thread_stream()) # standard python objects, public LEGACY_DEFAULT_STREAM = C_LEGACY_DEFAULT_STREAM From 04586be0537f5b651c895328c29edea949b8aef8 Mon Sep 17 00:00:00 2001 From: Monishver Chandrasekaran Date: Fri, 16 Jan 2026 10:58:01 -0500 Subject: [PATCH 3/5] Revert "feat: Make legacy_default and per_thread_default public - Fixes #1445" This reverts commit 61b5de2b6bc1b64e3f55c33393fd26f1202c65f6. 
--- cuda_core/cuda/core/_stream.pyx | 93 +++------------------------------ cuda_core/tests/test_stream.py | 36 +------------ 2 files changed, 10 insertions(+), 119 deletions(-) diff --git a/cuda_core/cuda/core/_stream.pyx b/cuda_core/cuda/core/_stream.pyx index c3093e674a..d1747abe2d 100644 --- a/cuda_core/cuda/core/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -107,91 +107,14 @@ cdef class Stream: return s @classmethod - def legacy_default(cls): - """Return the legacy default stream. - - The legacy default stream is an implicit stream which synchronizes - with all other streams in the same CUDA context except for non-blocking - streams. When any operation is launched on the legacy default stream, - it waits for all previously launched operations in blocking streams to - complete, and all subsequent operations in blocking streams wait for - the legacy default stream operation to complete. - - This stream is useful for ensuring strict ordering of operations but - may limit concurrency. For better performance in concurrent scenarios, - consider using per_thread_default() or creating explicit streams. - - This method returns the same singleton instance on every call for the - base Stream class. Subclasses will receive new instances of the subclass - type that wrap the same underlying CUDA stream. - - Returns - ------- - Stream - The legacy default stream singleton instance for the current context. - - See Also - -------- - per_thread_default : Per-thread default stream alternative. - from_handle : Create stream from existing handle. 
- - Examples - -------- - >>> from cuda.core import Stream - >>> stream1 = Stream.legacy_default() - >>> stream2 = Stream.legacy_default() - >>> stream1 is stream2 # True - returns same singleton - True - """ - # Return the singleton for the base Stream class - if cls is Stream: - return C_LEGACY_DEFAULT_STREAM - # For subclasses, create a new instance of the subclass type - else: - return Stream._from_handle(cls, get_legacy_stream()) + def _legacy_default(cls): + """Return the legacy default stream (supports subclassing).""" + return Stream._from_handle(cls, get_legacy_stream()) @classmethod - def per_thread_default(cls): - """Return the per-thread default stream. - - The per-thread default stream is local to both the calling thread and - the CUDA context. Unlike the legacy default stream, it does not - synchronize with other streams and behaves like an explicitly created - non-blocking stream. This allows for better concurrency in multi-threaded - applications. - - Each thread has its own per-thread default stream, enabling true - concurrent execution without implicit synchronization barriers. - - This method returns the same singleton instance on every call for the - base Stream class. Subclasses will receive new instances of the subclass - type that wrap the same underlying CUDA stream. - - Returns - ------- - Stream - The per-thread default stream singleton instance for the current - thread and context. - - See Also - -------- - legacy_default : Legacy default stream alternative. - from_handle : Create stream from existing handle. 
- - Examples - -------- - >>> from cuda.core import Stream - >>> stream1 = Stream.per_thread_default() - >>> stream2 = Stream.per_thread_default() - >>> stream1 is stream2 # True - returns same singleton - True - """ - # Return the singleton for the base Stream class - if cls is Stream: - return C_PER_THREAD_DEFAULT_STREAM - # For subclasses, create a new instance of the subclass type - else: - return Stream._from_handle(cls, get_per_thread_stream()) + def _per_thread_default(cls): + """Return the per-thread default stream (supports subclassing).""" + return Stream._from_handle(cls, get_per_thread_stream()) @classmethod def _init(cls, obj: IsStreamT | None = None, options=None, device_id: int = None, @@ -455,8 +378,8 @@ cdef class Stream: # c-only python objects, not public -cdef Stream C_LEGACY_DEFAULT_STREAM = Stream._from_handle(Stream, get_legacy_stream()) -cdef Stream C_PER_THREAD_DEFAULT_STREAM = Stream._from_handle(Stream, get_per_thread_stream()) +cdef Stream C_LEGACY_DEFAULT_STREAM = Stream._legacy_default() +cdef Stream C_PER_THREAD_DEFAULT_STREAM = Stream._per_thread_default() # standard python objects, public LEGACY_DEFAULT_STREAM = C_LEGACY_DEFAULT_STREAM diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index dff6a3e3a4..925daa7cd5 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -117,7 +117,7 @@ def test_stream_legacy_default_subclassing(): class MyStream(Stream): pass - stream = MyStream.legacy_default() + stream = MyStream._legacy_default() assert isinstance(stream, MyStream) @@ -125,42 +125,10 @@ def test_stream_per_thread_default_subclassing(): class MyStream(Stream): pass - stream = MyStream.per_thread_default() + stream = MyStream._per_thread_default() assert isinstance(stream, MyStream) -def test_stream_legacy_default_public_api(init_cuda): - """Test public legacy_default() method returns singleton.""" - stream1 = Stream.legacy_default() - stream2 = Stream.legacy_default() - - 
assert isinstance(stream1, Stream) - assert isinstance(stream2, Stream) - - # Verify singleton behavior - same Python object - assert stream1 is stream2, "Should return same singleton instance" - - # Verify it's the same as the module constant - assert stream1 is LEGACY_DEFAULT_STREAM, "Should be the same object as LEGACY_DEFAULT_STREAM" - assert stream2 is LEGACY_DEFAULT_STREAM, "Should be the same object as LEGACY_DEFAULT_STREAM" - - -def test_stream_per_thread_default_public_api(init_cuda): - """Test public per_thread_default() method returns singleton.""" - stream1 = Stream.per_thread_default() - stream2 = Stream.per_thread_default() - - assert isinstance(stream1, Stream) - assert isinstance(stream2, Stream) - - # Verify singleton behavior - same Python object - assert stream1 is stream2, "Should return same singleton instance" - - # Verify it's the same as the module constant - assert stream1 is PER_THREAD_DEFAULT_STREAM, "Should be the same object as PER_THREAD_DEFAULT_STREAM" - assert stream2 is PER_THREAD_DEFAULT_STREAM, "Should be the same object as PER_THREAD_DEFAULT_STREAM" - - # ============================================================================ # Stream Equality Tests # ============================================================================ From ec2d8c96e039bd434f1d0932ee8964af156905b4 Mon Sep 17 00:00:00 2001 From: Monishver Chandrasekaran Date: Fri, 16 Jan 2026 11:11:38 -0500 Subject: [PATCH 4/5] Made LEGACY_DEFAULT_STREAM and PER_THREAD_DEFAULT_STREAM available from cuda.core namespace & Reverted helper methods to private (_legacy_default, _per_thread_default) Signed-off-by: Monishver Chandrasekaran --- cuda_core/cuda/core/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 67a815d1de..f22bbc7f16 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -61,4 +61,9 @@ ) from cuda.core._module import 
Kernel, ObjectCode # noqa: E402 from cuda.core._program import Program, ProgramOptions # noqa: E402 -from cuda.core._stream import Stream, StreamOptions # noqa: E402 +from cuda.core._stream import ( # noqa: E402 + LEGACY_DEFAULT_STREAM, + PER_THREAD_DEFAULT_STREAM, + Stream, + StreamOptions, +) From 52086d4f54fc9650fd31c59e124c61ecad8f6e43 Mon Sep 17 00:00:00 2001 From: Monishver Chandrasekaran Date: Fri, 16 Jan 2026 11:31:13 -0500 Subject: [PATCH 5/5] Added release/0.6.0-notes.rst Signed-off-by: Monishver Chandrasekaran --- cuda_core/docs/source/release/0.6.0-notes.rst | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 cuda_core/docs/source/release/0.6.0-notes.rst diff --git a/cuda_core/docs/source/release/0.6.0-notes.rst b/cuda_core/docs/source/release/0.6.0-notes.rst new file mode 100644 index 0000000000..375394abaf --- /dev/null +++ b/cuda_core/docs/source/release/0.6.0-notes.rst @@ -0,0 +1,33 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +.. currentmodule:: cuda.core + +``cuda.core`` 0.6.0 Release Notes +================================== + +New features +------------ + +- Added public access to default CUDA streams via module-level constants ``LEGACY_DEFAULT_STREAM`` and ``PER_THREAD_DEFAULT_STREAM`` + + Users can now access default streams directly from the ``cuda.core`` namespace: + + .. code-block:: python + + from cuda.core import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM + + # Use legacy default stream (synchronizes with all blocking streams) + LEGACY_DEFAULT_STREAM.sync() + + # Use per-thread default stream (non-blocking, thread-local) + PER_THREAD_DEFAULT_STREAM.sync() + + The legacy default stream synchronizes with all blocking streams in the same CUDA context, ensuring strict ordering but potentially limiting concurrency. 
The per-thread default stream is local to both the calling thread and the CUDA context and does not synchronize with other streams (other than the legacy default stream, if both are used in the same program), enabling concurrent execution in multi-threaded applications. + + This replaces the previous undocumented workaround of using ``Stream.from_handle(0)`` to access the legacy default stream. + +Fixes and enhancements +----------------------- + +None.