I'm trying to port the classic GPU tree reduction via KernelAbstractions.jl.
See this for the direct CUDA implementation of what I'm trying to port from.
This is what I have implemented currently:
const TBSize = 1024::Int
const DotBlocks = 256::Int
@kernel function dot(@Const(a), @Const(b), size, partial)
local_i = @index(Local)
group_i = @index(Group)
tb_sum = @localmem T TBSize
@inbounds tb_sum[local_i] = 0.0
# do dot first
i = @index(Global)
while i <= size
@inbounds tb_sum[local_i] += a[i] * b[i]
i += TBSize * DotBlocks
end
# then tree reduction
offset = @private Int64 (1,)
@inbounds begin
offset[1] = @groupsize()[1] ÷ 2
while offset[1] > 0
@synchronize
if (local_i - 1) < offset[1]
tb_sum[local_i] += tb_sum[local_i+offset[1]]
end
offset[1] ÷= 2
end
end
if (local_i == 1)
@inbounds partial[group_i] = tb_sum[local_i]
end
end
# driver
wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
I was able to get correct results and performance seems mostly on par with our CUDA.jl(https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/CUDAStream.jl#L112) and AMDGPU.jl(https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/AMDGPUStream.jl#L135) implementation.
On CPU however, I got the following error:
Using device: AMD Ryzen 9 3900X 12-Core Processor (1T)
ERROR: LoadError: TaskFailedException
Stacktrace:
[1] wait
@ ./task.jl:322 [inlined]
[2] wait
@ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:65 [inlined]
[3] wait (repeats 2 times)
@ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:29 [inlined]
[4] runDot(device::Tuple{UndefInitializer, String, Backend}, size::Int64, ainit::Float32, binit::Float32)
@ Main ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:102
[5] top-level scope
@ ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:107
nested task error: MethodError: no method matching isless(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
Closest candidates are:
isless(::AbstractVector{T} where T, ::AbstractVector{T} where T) at abstractarray.jl:1989
isless(::Any, ::Missing) at missing.jl:88
isless(::Missing, ::Any) at missing.jl:87
...
Stacktrace:
[1] call
@ ~/.julia/packages/Cassette/N5kbV/src/context.jl:456 [inlined]
[2] fallback
@ ~/.julia/packages/Cassette/N5kbV/src/context.jl:454 [inlined]
[3] _overdub_fallback(::Any, ::Vararg{Any, N} where N)
@ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
[4] overdub
@ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
[5] <(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
@ ./operators.jl:279 [inlined]
[6] overdub
@ ./operators.jl:279 [inlined]
[7] >(::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true}, ::Int64)
@ ./operators.jl:305 [inlined]
[8] overdub
@ ./operators.jl:305 [inlined]
[9] overdub
@ ~/.julia/packages/KernelAbstractions/8W8KX/src/KernelAbstractions.jl:266 [inlined]
[10] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:157
[11] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:130
[12] (::KernelAbstractions.var"#33#34"{Nothing, Nothing, typeof(KernelAbstractions.__run), Tuple{KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, Tuple{Int64}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, KernelAbstractions.NDIteration.NoDynamicCheck}})()
@ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:22
in expression starting at /home/tom/babelstream-upstream/JuliaStream.jl/src/Test.jl:107
Removing the @synchronize macro in the while-loop makes the error go away but the answer becomes incorrect.
I've tried to do @print eltype(offset[1]), it prints the correct generic type (Float32 in the case) so I'm not sure what @synchronize is doing here.
For reference, here is what pkg status says:
[21141c5a] AMDGPU v0.2.7
[c7e460c6] ArgParse v1.1.4
[052768ef] CUDA v3.2.1
[72cfdca4] CUDAKernels v0.3.0
[e2ba6199] ExprTools v0.1.3 ⚲
[63c18a36] KernelAbstractions v0.7.0
[d96e819e] Parameters v0.12.2
[7eb9e9f0] ROCKernels v0.2.0
[8ba89e20] Distributed
And the complete Test.jl reproducer:
using Base: Float32
using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU
const TBSize = 1024::Int
const DotBlocks = 256::Int
@enum Backend cuda rocm cpu
const DeviceWithRepr = Tuple{Any,String,Backend}
function list_rocm_devices()::Vector{DeviceWithRepr}
try
# AMDGPU.agents()'s internal iteration order isn't stable
sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
map(x -> (x, repr(x), rocm), sorted)
catch
# probably unsupported
[]
end
end
function list_cuda_devices()::Vector{DeviceWithRepr}
return !CUDA.functional(false) ? [] :
map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
end
function devices()::Vector{DeviceWithRepr}
cudas = list_cuda_devices()
rocms = list_rocm_devices()
cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
vcat(cpus, cudas, rocms)
end
function runDot(device::DeviceWithRepr, size::Int, ainit::T, binit::T)::Tuple{T,T} where {T}
(actual, name, backend) = device
println("Using device: ", name)
as = fill(ainit, size)
bs = fill(binit, size)
if backend == cpu
partial_sum = Vector{T}(undef, DotBlocks)
a = Vector{T}(as)
b = Vector{T}(bs)
backendDevice = CPU()
elseif backend == cuda
CUDA.device!(actual)
partial_sum = CuArray{T}(undef, DotBlocks)
a = CuArray{T}(as)
b = CuArray{T}(bs)
backendDevice = CUDADevice()
elseif backend == rocm
AMDGPU.DEFAULT_AGENT[] = actual
partial_sum = ROCArray{T}(undef, DotBlocks)
a = ROCArray{T}(as)
b = ROCArray{T}(bs)
backendDevice = ROCDevice()
else
error("unsupported backend $(backend)")
end
@kernel function dot(@Const(a), @Const(b), size, partial)
local_i = @index(Local)
group_i = @index(Group)
tb_sum = @localmem T TBSize
@inbounds tb_sum[local_i] = 0.0
# do dot first
i = @index(Global)
while i <= size
@inbounds tb_sum[local_i] += a[i] * b[i]
i += TBSize * DotBlocks
end
# then tree reduction
offset = @private Int64 (1,)
@inbounds begin
offset[1] = @groupsize()[1] ÷ 2
while offset[1] > 0
@synchronize
if (local_i - 1) < offset[1]
tb_sum[local_i] += tb_sum[local_i+offset[1]]
end
offset[1] ÷= 2
end
end
if (local_i == 1)
@inbounds partial[group_i] = tb_sum[local_i]
end
end
function referenceDot()
sum = zero(T)
for i = 1:size
@inbounds sum += a[i] * b[i]
end
return sum
end
wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
return (referenceDot(), sum(partial_sum))
end
device = devices()[1]
(expected, actual) = runDot(device, TBSize * 2, 1.0f0, 2.0f0)
println("actual=", actual, ", expected=", expected)
I'm trying to port the classic GPU tree reduction via KernelAbstractions.jl.
See this for the direct CUDA implementation of what I'm trying to port from.
This is what I have implemented currently:
I was able to get correct results and performance seems mostly on par with our CUDA.jl(https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/CUDAStream.jl#L112) and AMDGPU.jl(https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/AMDGPUStream.jl#L135) implementation.
On CPU however, I got the following error:
Removing the
@synchronizemacro in the while-loop makes the error go away but the answer becomes incorrect.I've tried to do
@print eltype(offset[1]), it prints the correct generic type (Float32in the case) so I'm not sure what@synchronizeis doing here.For reference, here is what
pkg statussays:And the complete
Test.jlreproducer: