Broadcasting on a `SubArray` with `Vector{Int}` indices fails with a GPU compilation error

MWE:

```julia
using CUDA, LinearAlgebra

A = CUDA.cuRAND.randn(64, 64)
B = CUDA.cuRAND.randn(32)

Ad = view(A, diagind(A))

view(Ad, [1, 3, 4, 16]) .= view(B, [2, 6, 8, 10])
```

Fails with:

```
ERROR: LoadError: GPU compilation of MethodInstance for (::GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1")(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, ::SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{Vector{Int64}}, false}, ::Base.Broadcast.Broadcasted{CUDACore.CuArrayStyle{1, CUDACore.DeviceMemory}, Tuple{Base.OneTo{Int64}}, typeof(identity), Tuple{Base.Broadcast.Extruded{SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{CuDeviceVector{Int64, 1}}, false}, Tuple{Bool}, Tuple{Int64}}}}) failed
KernelError: passing non-bitstype argument

Argument 3 to your kernel function is of type SubArray{Float32, 1, CUDACore.CuDeviceVector{Float32, 1}, Tuple{Vector{Int64}}, false}, which is not a bitstype:
  .indices is of type Tuple{Vector{Int64}} which is not isbits.
    .1 is of type Vector{Int64} which is not isbits.
      .ref is of type MemoryRef{Int64} which is not isbits.
        .mem is of type Memory{Int64} which is not isbits.


Only bitstypes, which are "plain data" types that are immutable
and contain no references to other values, can be used in GPU kernels.
For more information, see the `Base.isbitstype` function.

Stacktrace:
  [1] check_invocation(job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/validation.jl:108
  [2] compile_unhooked(output::Symbol, job::GPUCompiler.CompilerJob; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:87
  [3] compile_unhooked
    @ ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:80 [inlined]
  [4] #compile#96
    @ ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:67 [inlined]
  [5] compile(target::Symbol, job::GPUCompiler.CompilerJob)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:55
  [6] #invoke_frozen#589
    @ ~/.julia/dev/CUDA/CUDACore/src/initialization.jl:30 [inlined]
  [7] invoke_frozen
    @ ~/.julia/dev/CUDA/CUDACore/src/initialization.jl:26 [inlined]
  [8] #compile##0
    @ ~/.julia/dev/CUDA/CUDACore/src/compiler/compilation.jl:250 [inlined]
  [9] JuliaContext(f::CUDACore.var"#compile##0#compile##1"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDACore.CUDACompilerParams}}; kwargs::@Kwargs{})
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:34
 [10] JuliaContext(f::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/driver.jl:25
 [11] compile(job::GPUCompiler.CompilerJob)
    @ CUDACore ~/.julia/dev/CUDA/CUDACore/src/compiler/compilation.jl:249
 [12] actual_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDACore.CUDACompilerParams}, compiler::typeof(CUDACore.compile), linker::typeof(CUDACore.link))
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/execution.jl:245
 [13] cached_compilation(cache::Dict{Any, CuFunction}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDACore.CUDACompilerParams}, compiler::Function, linker::Function)
    @ GPUCompiler ~/.julia/packages/GPUCompiler/kLxLE/src/execution.jl:159
 [14] macro expansion
    @ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:450 [inlined]
 [15] macro expansion
    @ ./lock.jl:376 [inlined]
 [16] cufunction(f::GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1", tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{Vector{Int64}}, false}, Base.Broadcast.Broadcasted{CUDACore.CuArrayStyle{1, CUDACore.DeviceMemory}, Tuple{Base.OneTo{Int64}}, typeof(identity), Tuple{Base.Broadcast.Extruded{SubArray{Float32, 1, CuDeviceVector{Float32, 1}, Tuple{CuDeviceVector{Int64, 1}}, false}, Tuple{Bool}, Tuple{Int64}}}}}}; kwargs::@Kwargs{always_inline::Bool, maxthreads::Nothing})
    @ CUDACore ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:445
 [17] cufunction
    @ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:442 [inlined]
 [18] #kernel_compile#737
    @ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:59 [inlined]
 [19] macro expansion
    @ ~/.julia/dev/CUDA/CUDACore/src/compiler/execution.jl:182 [inlined]
 [20] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1"})(::SubArray{Float32, 1, CuArray{Float32, 1, CUDACore.DeviceMemory}, Tuple{Vector{Int64}}, false}, ::Vararg{Any}; ndrange::Tuple{Int64}, workgroupsize::Nothing)
    @ CUDACore.CUDAKernels ~/.julia/dev/CUDA/CUDACore/src/CUDAKernels.jl:125
 [21] Kernel
    @ ~/.julia/dev/CUDA/CUDACore/src/CUDAKernels.jl:111 [inlined]
 [22] _copyto!
    @ ~/.julia/packages/GPUArrays/V2qKA/src/host/broadcast.jl:79 [inlined]
 [23] materialize!
    @ ~/.julia/packages/GPUArrays/V2qKA/src/host/broadcast.jl:43 [inlined]
 [24] materialize!(dest::SubArray{Float32, 1, CuArray{Float32, 1, CUDACore.DeviceMemory}, Tuple{Vector{Int64}}, false}, bc::Base.Broadcast.Broadcasted{CUDACore.CuArrayStyle{1, CUDACore.DeviceMemory}, Nothing, typeof(identity), Tuple{SubArray{Float32, 1, CuArray{Float32, 1, CUDACore.DeviceMemory}, Tuple{CuArray{Int64, 1, CUDACore.DeviceMemory}}, false}}})
    @ Base.Broadcast ./broadcast.jl:902
 [25] top-level scope
    @ ~/.julia/dev/CUDA/subarray_copy.jl:8
 [26] include(mod::Module, _path::String)
    @ Base ./Base.jl:306
 [27] exec_options(opts::Base.JLOptions)
    @ Base ./client.jl:317
 [28] _start()
    @ Base ./client.jl:550
in expression starting at /home/kshyatt/.julia/dev/CUDA/subarray_copy.jl:8
```

I'm posting this here rather than at `CUDA.jl` because I think it is more closely related to the existing broadcast kernel. I understand these indices are not contiguous, but it feels like this could be made to work.

Julia version 1.12.6
GPUArrays version 11.5.4
GPUCompiler version 1.13.0
CUDA 6.1.0


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Broadcasting on `SubArray` with inds of `Vector{Int}` fails with compiler issue #724

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Broadcasting on SubArray with inds of Vector{Int} fails with compiler issue #724

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions

Broadcasting on `SubArray` with inds of `Vector{Int}` fails with compiler issue #724