Skip to content

Broadcasting into a SubArray whose indices are a Vector{Int} fails with a GPU compilation error (non-bitstype kernel argument) #724

@kshyatt

Description

@kshyatt

MWE:

using CUDA, LinearAlgebra

A = CUDA.cuRAND.randn(64, 64)
B = CUDA.cuRAND.randn(32)

Ad = view(A, diagind(A))

view(Ad, [1, 3, 4, 16]) .= view(B, [2, 6, 8, 10])

Fails with:

ERROR: LoadError: GPU compilation of MethodInstance for (::GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1")(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, ::SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{Vector{Int64}}, false}, ::Base.Broadcast.Broadcasted{CUDACore.CuArrayStyle{1, CUDACore.DeviceMemory}, Tuple{Base.OneTo{Int64}}, typeof(identity), Tuple{Base.Broadcast.Extruded{SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{CuDeviceVector{Int64, 1}}, false}, Tuple{Bool}, Tuple{Int64}}}}) failed
KernelError: passing non-bitstype argument

Argument 3 to your kernel function is of type SubArray{Float32, 1, CUDACore.CuDeviceVector{Float32, 1}, Tuple{Vector{Int64}}, false}, which is not a bitstype:
  .indices is of type Tuple{Vector{Int64}} which is not isbits.
    .1 is of type Vector{Int64} which is not isbits.
      .ref is of type MemoryRef{Int64} which is not isbits.
        .mem is of type Memory{Int64} which is not isbits.


Only bitstypes, which are "plain data" types that are immutable
and contain no references to other values, can be used in GPU kernels.
For more information, see the `Base.isbitstype` function.

Stacktrace:
  [1] check_invocation(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/validation.jl:108
  [2] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:87
  [3] compile_unhooked
    @ ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:80 [inlined]
  [4] #compile#96
    @ ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:67 [inlined]
  [5] compile(target::Symbol, job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:55
  [6] #invoke_frozen#589
    @ ~/.julia/dev/CUDA/CUDACore/src/initialization.jl:30 [inlined]
  [7] invoke_frozen
    @ ~/.julia/dev/CUDA/CUDACore/src/initialization.jl:26 [inlined]
  [8] #compile##0
    @ ~/.julia/dev/CUDA/CUDACore/src/compiler/compilation.jl:250 [inlined]
  [9] JuliaContext(f::CUDACore.var"#compile##0#compile##1"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDACore.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:34
 [10] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:25
 [11] compile(job::GPUCompiler.CompilerJob)
    @ CUDACore ~/.julia/dev/CUDA/CUDACore/src/compiler/compilation.jl:249
 [12] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDACore.CUDACompilerParams}, compiler::typeof(CUDACore.compile), linker::typeof(CUDACore.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/execution.jl:245
 [13] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDACore.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/execution.jl:159
 [14] macro expansion
    @ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:450 [inlined]
 [15] macro expansion
    @ ./lock.jl:376 [inlined]
 [16] cufunction(f::GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1", tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{Vector{Int64}}, false}, Base.Broadcast.Broadcasted{CUDACore.CuArrayStyle{1, CUDACore.DeviceMemory}, Tuple{Base.OneTo{Int64}}, typeof(identity), Tuple{Base.Broadcast.Extruded{SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{CuDeviceVector{Int64, 1}}, false}, Tuple{Bool}, Tuple{Int64}}}}}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Nothing})
    @ CUDACore ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:445
 [17] cufunction
    @ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:442 [inlined]
 [18] #kernel_compile#737
    @ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:59 [inlined]
 [19] macro expansion
    @ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:182 [inlined]
 [20] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1"})(::SubArray{Float32, 1, CuArray{Float32, 1, CUDACore.DeviceMemory}, Tuple{Vector{Int64}}, false}, ::Vararg{Any}; ndrange::Tuple{Int64}, workgroupsize::Nothing)
    @ CUDACore.CUDAKernels ~/.julia/dev/CUDA/CUDACore/src/CUDAKernels.jl:125
 [21] Kernel
    @ ~/.julia/dev/CUDA/CUDACore/src/CUDAKernels.jl:111 [inlined]
 [22] _copyto!
    @ ~/.julia/packages/GPUArrays/V2qKA/src/host/broadcast.jl:79 [inlined]
 [23] materialize!
    @ ~/.julia/packages/GPUArrays/V2qKA/src/host/broadcast.jl:43 [inlined]
 [24] materialize!(dest::SubArray{Float32, 1, CuArray{Float32, 1, CUDACore.DeviceMemory}, Tuple{Vector{Int64}}, false}, bc::Base.Broadcast.Broadcasted{CUDACore.CuArrayStyle{1, CUDACore.DeviceMemory}, Nothing, typeof(identity), Tuple{SubArray{Float32, 1, CuArray{Float32, 1, CUDACore.DeviceMemory}, Tuple{CuArray{Int64, 1, CUDACore.DeviceMemory}}, false}}})
    @ Base.Broadcast ./broadcast.jl:902
 [25] top-level scope
    @ ~/.julia/dev/CUDA/subarray_copy.jl:8
 [26] include(mod::Module, _path::String)
    @ Base ./Base.jl:306
 [27] exec_options(opts::Base.JLOptions)
    @ Base ./client.jl:317
 [28] _start()
    @ Base ./client.jl:550
in expression starting at /home/kshyatt/.julia/dev/CUDA/subarray_copy.jl:8

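I'm posting this here rather than at CUDA.jl because I think this is more closely related to the existing broadcast kernel (`gpu_broadcast_kernel_linear`, launched from GPUArrays' `_copyto!`). Note that in the error above the destination view still carries a host `Vector{Int64}` as its index, whereas the source view's index has been converted to a `CuDeviceVector{Int64, 1}`. I understand these indices are not contiguous, but this feels like it could be made to work?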

Julia version 1.12.6
GPUArrays version 11.5.4
GPUCompiler version 1.13.0
CUDA version 6.1.0

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions