12 changes: 12 additions & 0 deletions cuda_core/cuda/core/_graph.py
@@ -746,6 +746,18 @@ def close(self):
"""Destroy the graph."""
self._mnff.close()

    @property
    def handle(self) -> driver.CUgraphExec:
Review comment (Member):

How do we decide if we want to expose CUgraph or CUgraphExec? Both can find their use cases.

Reply (Contributor Author):

It's a great question that I tried to sidestep in this change. (I'm working on another change to add resource handles to the graph module.)

My understanding is that class GraphBuilder corresponds to CUgraph and class Graph corresponds to CUgraphExec.

Following the existing pattern, after moving to resource handles, we would update the internal names to clarify:

  • GraphBuilder._mnff.graph → GraphBuilder._mnff._h_graph (CUgraph)
  • Graph._mnff.graph → Graph._mnff._h_graph_exec (CUgraphExec)

And the property types would be:

  • GraphBuilder.handle -> driver.CUgraph
  • Graph.handle -> driver.CUgraphExec

FYI, I added this property to expose CUgraphExec for the following code in test_device_launch.py:

inner_graph_handle = int(inner_graph.handle)
...
launch(gb_outer, LaunchConfig(grid=1, block=1), launcher_kernel, inner_graph_handle)
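
For reference, a minimal sketch of what the two properties might look like after that rename (the _h_graph/_h_graph_exec member names are the proposal above, not existing code):

# hypothetical post-rename layout, following the proposal above
class GraphBuilder:
    @property
    def handle(self) -> driver.CUgraph:
        """Return the underlying ``CUgraph`` object."""
        return self._mnff._h_graph

class Graph:
    @property
    def handle(self) -> driver.CUgraphExec:
        """Return the underlying ``CUgraphExec`` object."""
        return self._mnff._h_graph_exec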

"""Return the underlying ``CUgraphExec`` object.
.. caution::
This handle is a Python object. To get the memory address of the underlying C
handle, call ``int()`` on the returned object.
"""
return self._mnff.graph

    def update(self, builder: GraphBuilder):
        """Update the graph using new build configuration from the builder.
186 changes: 186 additions & 0 deletions cuda_core/tests/graph/test_advanced.py
Comment (Contributor Author):
Moved from test_graph.py

@@ -0,0 +1,186 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

"""Advanced graph feature tests (child graphs, update, stream lifetime)."""

import numpy as np
import pytest
from cuda.core import Device, LaunchConfig, LegacyPinnedMemoryResource, launch
from helpers.graph_kernels import compile_common_kernels, compile_conditional_kernels


@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
def test_graph_child_graph(init_cuda):
    mod = compile_common_kernels()
    add_one = mod.get_kernel("add_one")

    # Allocate memory
    launch_stream = Device().create_stream()
    mr = LegacyPinnedMemoryResource()
    b = mr.allocate(8)
    arr = np.from_dlpack(b).view(np.int32)
    arr[0] = 0
    arr[1] = 0

    # Capture the child graph
    gb_child = Device().create_graph_builder().begin_building()
    launch(gb_child, LaunchConfig(grid=1, block=1), add_one, arr[1:].ctypes.data)
    launch(gb_child, LaunchConfig(grid=1, block=1), add_one, arr[1:].ctypes.data)
    launch(gb_child, LaunchConfig(grid=1, block=1), add_one, arr[1:].ctypes.data)
    gb_child.end_building()

    # Capture the parent graph
    gb_parent = Device().create_graph_builder().begin_building()
    launch(gb_parent, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data)

    ## Add child
    try:
        gb_parent.add_child(gb_child)
    except NotImplementedError as e:
        with pytest.raises(
            NotImplementedError,
            match="^Launching child graphs is not implemented for versions older than CUDA 12",
        ):
            raise e
        gb_parent.end_building()
        b.close()
        pytest.skip("Launching child graphs is not implemented for versions older than CUDA 12")

    launch(gb_parent, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data)
    graph = gb_parent.end_building().complete()

    # Parent updates first value, child updates second value
    assert arr[0] == 0
    assert arr[1] == 0
    graph.launch(launch_stream)
    launch_stream.sync()
    assert arr[0] == 2
    assert arr[1] == 3

    # Close the memory resource now because the garbage collector might
    # deallocate it during the next graph-building pass
    b.close()


@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
def test_graph_update(init_cuda):
    mod = compile_conditional_kernels(int)
    add_one = mod.get_kernel("add_one")

    # Allocate memory
    launch_stream = Device().create_stream()
    mr = LegacyPinnedMemoryResource()
    b = mr.allocate(12)
    arr = np.from_dlpack(b).view(np.int32)
    arr[0] = 0
    arr[1] = 0
    arr[2] = 0

    def build_graph(condition_value):
        # Begin capture
        gb = Device().create_graph_builder().begin_building()

        # Add Node A (sets condition)
        handle = gb.create_conditional_handle(default_value=condition_value)

        # Add Node B (switch condition)
        try:
            gb_case = list(gb.switch(handle, 3))
        except Exception as e:
            with pytest.raises(RuntimeError, match="^(Driver|Binding) version"):
                raise e
            gb.end_building()
            raise e

        ## Case 0
        gb_case[0].begin_building()
        launch(gb_case[0], LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data)
        launch(gb_case[0], LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data)
        launch(gb_case[0], LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data)
        gb_case[0].end_building()

        ## Case 1
        gb_case[1].begin_building()
        launch(gb_case[1], LaunchConfig(grid=1, block=1), add_one, arr[1:].ctypes.data)
        launch(gb_case[1], LaunchConfig(grid=1, block=1), add_one, arr[1:].ctypes.data)
        launch(gb_case[1], LaunchConfig(grid=1, block=1), add_one, arr[1:].ctypes.data)
        gb_case[1].end_building()

        ## Case 2
        gb_case[2].begin_building()
        launch(gb_case[2], LaunchConfig(grid=1, block=1), add_one, arr[2:].ctypes.data)
        launch(gb_case[2], LaunchConfig(grid=1, block=1), add_one, arr[2:].ctypes.data)
        launch(gb_case[2], LaunchConfig(grid=1, block=1), add_one, arr[2:].ctypes.data)
        gb_case[2].end_building()

        return gb.end_building()

    try:
        graph_variants = [build_graph(0), build_graph(1), build_graph(2)]
    except Exception as e:
        with pytest.raises(RuntimeError, match="^(Driver|Binding) version"):
            raise e
        b.close()
        pytest.skip("Driver does not support conditional switch")

    # Launch the first graph
    assert arr[0] == 0
    assert arr[1] == 0
    assert arr[2] == 0
    graph = graph_variants[0].complete()
    graph.launch(launch_stream)
    launch_stream.sync()
    assert arr[0] == 3
    assert arr[1] == 0
    assert arr[2] == 0

    # Update with second variant and launch again
    graph.update(graph_variants[1])
    graph.launch(launch_stream)
    launch_stream.sync()
    assert arr[0] == 3
    assert arr[1] == 3
    assert arr[2] == 0

    # Update with third variant and launch again
    graph.update(graph_variants[2])
    graph.launch(launch_stream)
    launch_stream.sync()
    assert arr[0] == 3
    assert arr[1] == 3
    assert arr[2] == 3

    # Close the memory resource now because the garbage collector might
    # deallocate it during the next graph-building pass
    b.close()


def test_graph_stream_lifetime(init_cuda):
    mod = compile_common_kernels()
    empty_kernel = mod.get_kernel("empty_kernel")

    # Create simple graph from device
    gb = Device().create_graph_builder().begin_building()
    launch(gb, LaunchConfig(grid=1, block=1), empty_kernel)
    graph = gb.end_building().complete()

    # Destroy simple graph and builder
    gb.close()
    graph.close()

    # Create simple graph from stream
    stream = Device().create_stream()
    gb = stream.create_graph_builder().begin_building()
    launch(gb, LaunchConfig(grid=1, block=1), empty_kernel)
    graph = gb.end_building().complete()

    # Destroy simple graph and builder
    gb.close()
    graph.close()

    # Verify the stream can still launch work
    launch(stream, LaunchConfig(grid=1, block=1), empty_kernel)
    stream.sync()

    # Destroy the stream
    stream.close()
164 changes: 164 additions & 0 deletions cuda_core/tests/graph/test_basic.py
Comment (Contributor Author):
Moved from test_graph.py

@@ -0,0 +1,164 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

"""Basic graph construction and topology tests."""

import numpy as np
import pytest
from cuda.core import Device, GraphBuilder, LaunchConfig, LegacyPinnedMemoryResource, launch
from helpers.graph_kernels import compile_common_kernels


def test_graph_is_building(init_cuda):
    gb = Device().create_graph_builder()
    assert gb.is_building is False
    gb.begin_building()
    assert gb.is_building is True
    gb.end_building()
    assert gb.is_building is False


def test_graph_straight(init_cuda):
    mod = compile_common_kernels()
    empty_kernel = mod.get_kernel("empty_kernel")
    launch_stream = Device().create_stream()

    # Simple linear topology
    gb = Device().create_graph_builder().begin_building()
    launch(gb, LaunchConfig(grid=1, block=1), empty_kernel)
    launch(gb, LaunchConfig(grid=1, block=1), empty_kernel)
    launch(gb, LaunchConfig(grid=1, block=1), empty_kernel)
    graph = gb.end_building().complete()

    # Sanity upload and launch
    graph.upload(launch_stream)
    graph.launch(launch_stream)
    launch_stream.sync()


def test_graph_fork_join(init_cuda):
    mod = compile_common_kernels()
    empty_kernel = mod.get_kernel("empty_kernel")
    launch_stream = Device().create_stream()

    # Simple diamond topology
    gb = Device().create_graph_builder().begin_building()
    launch(gb, LaunchConfig(grid=1, block=1), empty_kernel)

    with pytest.raises(ValueError, match="^Invalid split count: expecting >= 2, got 1"):
        gb.split(1)

    left, right = gb.split(2)
    launch(left, LaunchConfig(grid=1, block=1), empty_kernel)
    launch(left, LaunchConfig(grid=1, block=1), empty_kernel)
    launch(right, LaunchConfig(grid=1, block=1), empty_kernel)
    launch(right, LaunchConfig(grid=1, block=1), empty_kernel)

    with pytest.raises(ValueError, match="^Must join with at least two graph builders"):
        GraphBuilder.join(left)

    gb = GraphBuilder.join(left, right)

    launch(gb, LaunchConfig(grid=1, block=1), empty_kernel)
    graph = gb.end_building().complete()

    # Sanity upload and launch
    graph.upload(launch_stream)
    graph.launch(launch_stream)
    launch_stream.sync()


def test_graph_is_join_required(init_cuda):
    mod = compile_common_kernels()
    empty_kernel = mod.get_kernel("empty_kernel")

    # The starting builder is always primary
    gb = Device().create_graph_builder()
    assert gb.is_join_required is False
    gb.begin_building()

    # Create root node
    launch(gb, LaunchConfig(grid=1, block=1), empty_kernel)

    # The first returned builder is always the original
    first_split_builders = gb.split(3)
    assert first_split_builders[0] is gb

    # Only the original builder does not need to join
    assert first_split_builders[0].is_join_required is False
    for builder in first_split_builders[1:]:
        assert builder.is_join_required is True

    # Launch a kernel on each split
    for builder in first_split_builders:
        launch(builder, LaunchConfig(grid=1, block=1), empty_kernel)

    # Splitting a non-primary builder yields builders that all require joining
    second_split_builders = first_split_builders[-1]
    first_split_builders = first_split_builders[0:-1]
    second_split_builders = second_split_builders.split(3)
    for builder in second_split_builders:
        assert builder.is_join_required is True

    # Launch a kernel on each second split
    for builder in second_split_builders:
        launch(builder, LaunchConfig(grid=1, block=1), empty_kernel)

    # A joined builder requires joining only if all of its sources required joining
    gb = GraphBuilder.join(*second_split_builders)
    assert gb.is_join_required is True
    gb = GraphBuilder.join(gb, *first_split_builders)
    assert gb.is_join_required is False

    # Create final node
    launch(gb, LaunchConfig(grid=1, block=1), empty_kernel)
    gb.end_building().complete()


@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
def test_graph_repeat_capture(init_cuda):
    mod = compile_common_kernels()
    add_one = mod.get_kernel("add_one")

    # Allocate memory
    launch_stream = Device().create_stream()
    mr = LegacyPinnedMemoryResource()
    b = mr.allocate(4)
    arr = np.from_dlpack(b).view(np.int32)
    arr[0] = 0

    # Capture the graph
    gb = launch_stream.create_graph_builder().begin_building()
    launch(gb, LaunchConfig(grid=1, block=1), add_one, arr.ctypes.data)
    graph = gb.end_building().complete()

    # Run the graph once
    graph.launch(launch_stream)
    launch_stream.sync()
    assert arr[0] == 1

    # Capture cannot be resumed to extend the graph
    with pytest.raises(RuntimeError, match="^Cannot resume building after building has ended."):
        gb.begin_building()

    # The graph can be re-launched
    graph.launch(launch_stream)
    graph.launch(launch_stream)
    graph.launch(launch_stream)
    launch_stream.sync()
    assert arr[0] == 4

    # Close the memory resource now because the garbage collector might
    # deallocate it during the next graph-building pass
    b.close()


def test_graph_capture_errors(init_cuda):
    gb = Device().create_graph_builder()
    with pytest.raises(RuntimeError, match="^Graph has not finished building."):
        gb.complete()

    gb.begin_building()
    with pytest.raises(RuntimeError, match="^Graph has not finished building."):
        gb.complete()
    gb.end_building().complete()
@@ -2,6 +2,8 @@
#
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

"""Graph memory resource tests."""

import pytest
from cuda.core import (
    Device,