ERROR: LoadError: GPU compilation of MethodInstance for (::GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1")(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, ::SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{Vector{Int64}}, false}, ::Base.Broadcast.Broadcasted{CUDACore.CuArrayStyle{1, CUDACore.DeviceMemory}, Tuple{Base.OneTo{Int64}}, typeof(identity), Tuple{Base.Broadcast.Extruded{SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{CuDeviceVector{Int64, 1}}, false}, Tuple{Bool}, Tuple{Int64}}}}) failed
KernelError: passing non-bitstype argument
Argument 3 to your kernel function is of type SubArray{Float32, 1, CUDACore.CuDeviceVector{Float32, 1}, Tuple{Vector{Int64}}, false}, which is not a bitstype:
.indices is of type Tuple{Vector{Int64}} which is not isbits.
.1 is of type Vector{Int64} which is not isbits.
.ref is of type MemoryRef{Int64} which is not isbits.
.mem is of type Memory{Int64} which is not isbits.
Only bitstypes, which are "plain data" types that are immutable
and contain no references to other values, can be used in GPU kernels.
For more information, see the `Base.isbitstype` function.
Stacktrace:
[1] check_invocation(job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/validation.jl:108
[2] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:87
[3] compile_unhooked
@ ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:80 [inlined]
[4] #compile#96
@ ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:67 [inlined]
[5] compile(target::Symbol, job::GPUCompiler.CompilerJob)
@ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:55
[6] #invoke_frozen#589
@ ~/.julia/dev/CUDA/CUDACore/src/initialization.jl:30 [inlined]
[7] invoke_frozen
@ ~/.julia/dev/CUDA/CUDACore/src/initialization.jl:26 [inlined]
[8] #compile##0
@ ~/.julia/dev/CUDA/CUDACore/src/compiler/compilation.jl:250 [inlined]
[9] JuliaContext(f::CUDACore.var"#compile##0#compile##1"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDACore.CUDACompilerParams}}; kwargs::@Kwargs{})
@ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:34
[10] JuliaContext(f::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:25
[11] compile(job::GPUCompiler.CompilerJob)
@ CUDACore ~/.julia/dev/CUDA/CUDACore/src/compiler/compilation.jl:249
[12] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDACore.CUDACompilerParams}, compiler::typeof(CUDACore.compile), linker::typeof(CUDACore.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/execution.jl:245
[13] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDACore.CUDACompilerParams}, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/execution.jl:159
[14] macro expansion
@ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:450 [inlined]
[15] macro expansion
@ ./lock.jl:376 [inlined]
[16] cufunction(f::GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1", tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{Vector{Int64}}, false}, Base.Broadcast.Broadcasted{CUDACore.CuArrayStyle{1, CUDACore.DeviceMemory}, Tuple{Base.OneTo{Int64}}, typeof(identity), Tuple{Base.Broadcast.Extruded{SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{CuDeviceVector{Int64, 1}}, false}, Tuple{Bool}, Tuple{Int64}}}}}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Nothing})
@ CUDACore ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:445
[17] cufunction
@ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:442 [inlined]
[18] #kernel_compile#737
@ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:59 [inlined]
[19] macro expansion
@ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:182 [inlined]
[20] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1"})(::SubArray{Float32, 1, CuArray{Float32, 1, CUDACore.DeviceMemory}, Tuple{Vector{Int64}}, false}, ::Vararg{Any}; ndrange::Tuple{Int64}, workgroupsize::Nothing)
@ CUDACore.CUDAKernels ~/.julia/dev/CUDA/CUDACore/src/CUDAKernels.jl:125
[21] Kernel
@ ~/.julia/dev/CUDA/CUDACore/src/CUDAKernels.jl:111 [inlined]
[22] _copyto!
@ ~/.julia/packages/GPUArrays/V2qKA/src/host/broadcast.jl:79 [inlined]
[23] materialize!
@ ~/.julia/packages/GPUArrays/V2qKA/src/host/broadcast.jl:43 [inlined]
[24] materialize!(dest::SubArray{Float32, 1, CuArray{Float32, 1, CUDACore.DeviceMemory}, Tuple{Vector{Int64}}, false}, bc::Base.Broadcast.Broadcasted{CUDACore.CuArrayStyle{1, CUDACore.DeviceMemory}, Nothing, typeof(identity), Tuple{SubArray{Float32, 1, CuArray{Float32, 1, CUDACore.DeviceMemory}, Tuple{CuArray{Int64, 1, CUDACore.DeviceMemory}}, false}}})
@ Base.Broadcast ./broadcast.jl:902
[25] top-level scope
@ ~/.julia/dev/CUDA/subarray_copy.jl:8
[26] include(mod::Module, _path::String)
@ Base ./Base.jl:306
[27] exec_options(opts::Base.JLOptions)
@ Base ./client.jl:317
[28] _start()
@ Base ./client.jl:550
in expression starting at /home/kshyatt/.julia/dev/CUDA/subarray_copy.jl:8
MWE:
Fails with (full error and stack trace shown above):
I'm posting this here rather than at `CUDA.jl` because I think this is more related to the existing broadcast kernel. I understand these indices are not contiguous, but this feels like it could be made to work?

Julia version 1.12.6
GPUArrays version 11.5.4
GPUCompiler version 1.13.0
CUDA 6.1.0