From a1f150ac2b4a7868c8638946b6bac94521802bac Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Wed, 6 May 2026 12:48:01 -0500
Subject: [PATCH 1/3] Read the uniform work-group size from the dispatch
 packet instead of the workgroup-size builtins
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Upstream commit 765d09dd1b04 ("clang/AMDGPU: Fix workgroup size builtins
for nonuniform work group sizes") changed the semantics of
__builtin_amdgcn_workgroup_size_{x,y,z}():

- Before: it returned the uniform/enqueued work-group size (a direct
  load from dispatch->workgroup_size_x for COV4, or
  implicitarg.group_size for COV5).

- After: when OffloadUniformBlock is false (the default for
  OpenCL >= 2.0 and for non-uniform launches), the builtin emits
  umin(grid_size - workgroup_id*group_size, group_size) -- the actual
  size of the current work-group, which equals the uniform size for
  full groups but is smaller for the trailing partial group.
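To make the failure mode concrete, consider a hypothetical 1D dispatch
over the 329-wide image from the failing test below, with an assumed
enqueued work-group size of 64 (illustrative numbers, not taken from a
trace):

    // Groups 0..4 are full; group 5 holds the trailing 329 - 5*64 = 9 items.
    //   builtin before the change: 64 in every group         (uniform size)
    //   builtin after the change:  umin(329 - 5*64, 64) = 9  (actual size,
    //                              in group 5)
    // get_global_id_x() = g*s + l, evaluated in group 5 at l = 0:
    //   s = uniform size 64: 5*64 + 0 = 320  -> correct
    //   s = actual size 9:   5*9  + 0 = 45   -> wrong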
The change broke every consumer that was relying on the historical
"uniform size" contract. In particular, in
rocm-device-libs/ockl/src/workitem.cl (legacy ABI / COV <= 4 path):

┌─────────────────────────────────┬───────────────────────────────────────────────────┐
│ Function                        │ Why it broke                                      │
├─────────────────────────────────┼───────────────────────────────────────────────────┤
│ get_global_id_{x,y,z}           │ computes g*s + l + offset; needs s = uniform size │
├─────────────────────────────────┼───────────────────────────────────────────────────┤
│ get_global_linear_id_{x,y,z}    │ same g*s + l formula                              │
├─────────────────────────────────┼───────────────────────────────────────────────────┤
│ get_enqueued_local_size_{x,y,z} │ by definition is the uniform size                 │
├─────────────────────────────────┼───────────────────────────────────────────────────┤
│ get_num_groups_{x,y,z}          │ n/d + (n > q*d); needs d = uniform size           │
└─────────────────────────────────┴───────────────────────────────────────────────────┘

get_local_size_{x,y,z} already manually computes the remainder using
grid_size, so it stays correct (just redundant) when s is uniform; with
the post-patch builtin it coincidentally still returns the right answer
because min(remainder, actual) collapses to actual.
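The same numbers show why get_num_groups broke while get_local_size
survived (again the assumed 329/64 launch from above):

    // get_num_groups: n/d + (n > q*d), with n = 329
    //   d = uniform size 64: q = 5,  5 + (329 > 320)  = 6   -> correct
    //   d = actual size 9:   q = 36, 36 + (329 > 324) = 37  -> wrong
    // get_local_size in group 5: r = 329 - 5*d; result = min(r, d)
    //   d = uniform size 64: r = 9,   min(9, 64)  = 9  -> correct
    //   d = actual size 9:   r = 284, min(284, 9) = 9  -> still correct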
clang/lib/Headers/amdgpuintrin.h (__gpu_num_blocks_*,
__gpu_num_threads_*) has the same bug for OpenMP/HIP offload code.

The OpenCL CTS test fails because the HSA runtime's blit kernel
(rocm-systems/projects/rocr-runtime/runtime/hsa-runtime/image/blit_src/imageblit_kernels.cl,
function copy_buffer_to_image) is built with -cl-std=CL2.0
-mcode-object-version=4, calls get_global_id, and links against
device-libs. After the patch its get_global_id_x returns
g*actual_size + l, so threads in the partial trailing work-group write
the source buffer to the wrong destination pixels. When the test then
reads coord 321 of a 329-wide image, it sees the never-written
zero-initialized pixel -- exactly the (0,0,0,1) that we observed.

Two files modified:

1. compiler/amd-llvm/amd/device-libs/ockl/src/workitem.cl -- in every
   OLD_ABI branch of the four affected function families, replace
   __builtin_amdgcn_workgroup_size_{x,y,z}() with a direct read of
   p->workgroup_size_{x,y,z} from the HSA dispatch packet (the value
   the builtin used to return). 12 call sites updated; added a header
   comment explaining why the builtin can't be used here.

2. compiler/amd-llvm/clang/lib/Headers/amdgpuintrin.h -- added a
   private __GPU_AMDGCN_UNIFORM_WG_SIZE(idx) macro that reads the
   uniform work-group size from the dispatch packet (uint16_t at byte
   offset 4 + 2*idx), and routed __gpu_num_blocks_{x,y,z} and
   __gpu_num_threads_{x,y,z} through it.
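(For reference, the 4 + 2*idx arithmetic follows from the leading
fields of hsa_kernel_dispatch_packet_t; field list abbreviated:

    // uint16_t header;            // byte offset 0
    // uint16_t setup;             // byte offset 2
    // uint16_t workgroup_size_x;  // byte offset 4 -> uint16_t index 2
    // uint16_t workgroup_size_y;  // byte offset 6 -> uint16_t index 3
    // uint16_t workgroup_size_z;  // byte offset 8 -> uint16_t index 4

so ((__gpu_constant uint16_t *)__builtin_amdgcn_dispatch_ptr())[2 + idx]
selects workgroup_size_{x,y,z}.)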
Verification

- Built artifact-group-opencl-runtime artifact-base amd-comgr+dist
  ROCR-Runtime+dist artifact-sysdeps.
- Disassembled the rebuilt copy_buffer_to_image blit kernel: it now does
  workgroup_id * uniform_size + workitem_id directly (no s_sub_i32 /
  s_min_u32 remainder dance), matching the workaround output.
- test_image_streams CL_RGB CL_UNORM_INT_101010 CL_FILTER_NEAREST
  CL_ADDRESS_CLAMP_TO_EDGE UNNORMALIZED read float 1D now passes 2/2
  sub-tests, three runs in a row.

Co-authored-by: Claude
---
 amd/device-libs/ockl/src/workitem.cl | 53 ++++++++++++++++++----------
 clang/lib/Headers/amdgpuintrin.h     | 20 +++++++----
 2 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/amd/device-libs/ockl/src/workitem.cl b/amd/device-libs/ockl/src/workitem.cl
index 9f29b01a60675..c6d448295f28d 100644
--- a/amd/device-libs/ockl/src/workitem.cl
+++ b/amd/device-libs/ockl/src/workitem.cl
@@ -86,6 +86,16 @@ get_global_size_z(void)
     }
 }
 
+// Compute the uniform (enqueued) work-group size on the legacy ABI path.
+// Historically __builtin_amdgcn_workgroup_size_{x,y,z}() returned this value
+// (a direct load from the HSA dispatch packet), but a later clang change made
+// the builtin emit the *actual* size of the current work-group, which differs
+// from the uniform size for the trailing partial group when work-groups are
+// non-uniform (OpenCL 2.0+ default). The functions in this file that compute
+// global IDs, num-groups, enqueued-local-size, and global-linear-IDs all need
+// the uniform size; using the builtin here would return wrong results for
+// every work-item in the partial group. Read the dispatch packet directly
+// instead.
 ATTR static size_t
 get_global_id_x(void)
 {
@@ -93,7 +103,8 @@ get_global_id_x(void)
     uint g = __builtin_amdgcn_workgroup_id_x();
     uint s;
     if (OLD_ABI) {
-        s = __builtin_amdgcn_workgroup_size_x();
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        s = p->workgroup_size_x;
     } else {
         __constant amdhsa_implicit_kernarg_v5 *args = get_v5_implicitarg_ptr();
         s = (uint)args->group_size[0];
@@ -108,7 +119,8 @@ get_global_id_y(void)
     uint g = __builtin_amdgcn_workgroup_id_y();
     uint s;
     if (OLD_ABI) {
-        s = __builtin_amdgcn_workgroup_size_y();
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        s = p->workgroup_size_y;
     } else {
         __constant amdhsa_implicit_kernarg_v5 *args = get_v5_implicitarg_ptr();
         s = (uint)args->group_size[1];
@@ -123,7 +135,8 @@ get_global_id_z(void)
     uint g = __builtin_amdgcn_workgroup_id_z();
     uint s;
     if (OLD_ABI) {
-        s = __builtin_amdgcn_workgroup_size_z();
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        s = p->workgroup_size_z;
     } else {
         __constant amdhsa_implicit_kernarg_v5 *args = get_v5_implicitarg_ptr();
         s = (uint)args->group_size[2];
@@ -137,7 +150,7 @@ get_local_size_x(void)
     if (OLD_ABI) {
         __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
         uint group_id = __builtin_amdgcn_workgroup_id_x();
-        uint group_size = __builtin_amdgcn_workgroup_size_x();
+        uint group_size = p->workgroup_size_x;
         uint grid_size = p->grid_size_x;
         uint r = grid_size - group_id * group_size;
         return (r < group_size) ? r : group_size;
@@ -154,7 +167,7 @@ get_local_size_y(void)
     if (OLD_ABI) {
         __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
         uint group_id = __builtin_amdgcn_workgroup_id_y();
-        uint group_size = __builtin_amdgcn_workgroup_size_y();
+        uint group_size = p->workgroup_size_y;
         uint grid_size = p->grid_size_y;
         uint r = grid_size - group_id * group_size;
         return (r < group_size) ? r : group_size;
@@ -171,7 +184,7 @@ get_local_size_z(void)
     if (OLD_ABI) {
         __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
         uint group_id = __builtin_amdgcn_workgroup_id_z();
-        uint group_size = __builtin_amdgcn_workgroup_size_z();
+        uint group_size = p->workgroup_size_z;
         uint grid_size = p->grid_size_z;
         uint r = grid_size - group_id * group_size;
         return (r < group_size) ? r : group_size;
@@ -186,7 +199,8 @@ ATTR static size_t
 get_enqueued_local_size_x(void)
 {
     if (OLD_ABI) {
-        return __builtin_amdgcn_workgroup_size_x();
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        return p->workgroup_size_x;
     } else {
         return (size_t)get_v5_implicitarg_ptr()->group_size[0];
     }
@@ -196,7 +210,8 @@ ATTR static size_t
 get_enqueued_local_size_y(void)
 {
     if (OLD_ABI) {
-        return __builtin_amdgcn_workgroup_size_y();
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        return p->workgroup_size_y;
     } else {
         return (size_t)get_v5_implicitarg_ptr()->group_size[1];
     }
@@ -206,7 +221,8 @@ ATTR static size_t
 get_enqueued_local_size_z(void)
 {
     if (OLD_ABI) {
-        return __builtin_amdgcn_workgroup_size_z();
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        return p->workgroup_size_z;
     } else {
         return (size_t)get_v5_implicitarg_ptr()->group_size[2];
     }
@@ -218,7 +234,7 @@ get_num_groups_x(void)
     if (OLD_ABI) {
         __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
         uint n = p->grid_size_x;
-        uint d = __builtin_amdgcn_workgroup_size_x();
+        uint d = p->workgroup_size_x;
         uint q = n / d;
         return q + (n > q*d);
     } else {
@@ -233,7 +249,7 @@ get_num_groups_y(void)
     if (OLD_ABI) {
         __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
         uint n = p->grid_size_y;
-        uint d = __builtin_amdgcn_workgroup_size_y();
+        uint d = p->workgroup_size_y;
         uint q = n / d;
         return q + (n > q*d);
     } else {
@@ -248,7 +264,7 @@ get_num_groups_z(void)
     if (OLD_ABI) {
         __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
         uint n = p->grid_size_z;
-        uint d = __builtin_amdgcn_workgroup_size_z();
+        uint d = p->workgroup_size_z;
         uint q = n / d;
         return q + (n > q*d);
     } else {
@@ -275,7 +291,8 @@ get_global_linear_id_x(void)
     uint g0 = __builtin_amdgcn_workgroup_id_x();
     uint s0;
     if (OLD_ABI) {
-        s0 = __builtin_amdgcn_workgroup_size_x();
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        s0 = p->workgroup_size_x;
     } else {
         s0 = (uint)get_v5_implicitarg_ptr()->group_size[0];
     }
@@ -294,8 +311,8 @@ get_global_linear_id_y(void)
 
     if (OLD_ABI) {
         __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
-        s0 = __builtin_amdgcn_workgroup_size_x();
-        s1 = __builtin_amdgcn_workgroup_size_y();
+        s0 = p->workgroup_size_x;
+        s1 = p->workgroup_size_y;
         n0 = p->grid_size_x;
     } else {
         __constant amdhsa_implicit_kernarg_v5 *args = get_v5_implicitarg_ptr();
@@ -322,9 +339,9 @@ get_global_linear_id_z(void)
 
     if (OLD_ABI) {
         __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
-        s0 = __builtin_amdgcn_workgroup_size_x();
-        s1 = __builtin_amdgcn_workgroup_size_y();
-        s2 = __builtin_amdgcn_workgroup_size_z();
+        s0 = p->workgroup_size_x;
+        s1 = p->workgroup_size_y;
+        s2 = p->workgroup_size_z;
         n0 = p->grid_size_x;
         n1 = p->grid_size_y;
     } else {
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 3f4bba0a03930..95f8040b4f6cc 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -27,19 +27,27 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 #define __gpu_global __attribute__((address_space(1)))
 #define __gpu_generic __attribute__((address_space(0)))
 
+// __builtin_amdgcn_workgroup_size_{x,y,z}() now returns the actual size of
+// the current work-group, which differs from the uniform (enqueued) size for
+// the trailing partial work-group. __gpu_num_blocks_* and __gpu_num_threads_*
+// must report the uniform size, so read it directly from the HSA dispatch
+// packet (workgroup_size_{x,y,z} live at offsets 4, 6, 8 as uint16_t).
+#define __GPU_AMDGCN_UNIFORM_WG_SIZE(idx) \
+  ((uint32_t)((const __gpu_constant uint16_t *)__builtin_amdgcn_dispatch_ptr())[2 + (idx)])
+
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
-  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
+  return __builtin_amdgcn_grid_size_x() / __GPU_AMDGCN_UNIFORM_WG_SIZE(0);
 }
 
 // Returns the number of workgroups in the 'y' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
-  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
+  return __builtin_amdgcn_grid_size_y() / __GPU_AMDGCN_UNIFORM_WG_SIZE(1);
 }
 
 // Returns the number of workgroups in the 'z' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
-  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
+  return __builtin_amdgcn_grid_size_z() / __GPU_AMDGCN_UNIFORM_WG_SIZE(2);
 }
 
 // Returns the 'x' dimension of the current AMD workgroup's id.
@@ -59,17 +67,17 @@ _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
 
 // Returns the number of workitems in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
-  return __builtin_amdgcn_workgroup_size_x();
+  return __GPU_AMDGCN_UNIFORM_WG_SIZE(0);
 }
 
 // Returns the number of workitems in the 'y' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
-  return __builtin_amdgcn_workgroup_size_y();
+  return __GPU_AMDGCN_UNIFORM_WG_SIZE(1);
 }
 
 // Returns the number of workitems in the 'z' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
-  return __builtin_amdgcn_workgroup_size_z();
+  return __GPU_AMDGCN_UNIFORM_WG_SIZE(2);
 }
 
 // Returns the 'x' dimension id of the workitem in the current AMD workgroup.

From 078be6dc6e4f079a388e86c5315f02ec3ed131d2 Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Wed, 6 May 2026 17:07:44 -0500
Subject: [PATCH 2/3] Remove the workitem.cl header comment

---
 amd/device-libs/ockl/src/workitem.cl | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/amd/device-libs/ockl/src/workitem.cl b/amd/device-libs/ockl/src/workitem.cl
index c6d448295f28d..08452bdf3a594 100644
--- a/amd/device-libs/ockl/src/workitem.cl
+++ b/amd/device-libs/ockl/src/workitem.cl
@@ -86,16 +86,6 @@ get_global_size_z(void)
     }
 }
 
-// Compute the uniform (enqueued) work-group size on the legacy ABI path.
-// Historically __builtin_amdgcn_workgroup_size_{x,y,z}() returned this value
-// (a direct load from the HSA dispatch packet), but a later clang change made
-// the builtin emit the *actual* size of the current work-group, which differs
-// from the uniform size for the trailing partial group when work-groups are
-// non-uniform (OpenCL 2.0+ default). The functions in this file that compute
-// global IDs, num-groups, enqueued-local-size, and global-linear-IDs all need
-// the uniform size; using the builtin here would return wrong results for
-// every work-item in the partial group. Read the dispatch packet directly
-// instead.
 ATTR static size_t
 get_global_id_x(void)
 {

From 02757ac6d0721d036ed0c0f8bc8197e9ec0ce72c Mon Sep 17 00:00:00 2001
From: Patrick Simmons
Date: Mon, 11 May 2026 11:43:45 -0500
Subject: [PATCH 3/3] Revert amdgpuintrin.h changes.

---
 clang/lib/Headers/amdgpuintrin.h | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 95f8040b4f6cc..3f4bba0a03930 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -27,27 +27,19 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
 #define __gpu_global __attribute__((address_space(1)))
 #define __gpu_generic __attribute__((address_space(0)))
 
-// __builtin_amdgcn_workgroup_size_{x,y,z}() now returns the actual size of
-// the current work-group, which differs from the uniform (enqueued) size for
-// the trailing partial work-group. __gpu_num_blocks_* and __gpu_num_threads_*
-// must report the uniform size, so read it directly from the HSA dispatch
-// packet (workgroup_size_{x,y,z} live at offsets 4, 6, 8 as uint16_t).
-#define __GPU_AMDGCN_UNIFORM_WG_SIZE(idx) \
-  ((uint32_t)((const __gpu_constant uint16_t *)__builtin_amdgcn_dispatch_ptr())[2 + (idx)])
-
 // Returns the number of workgroups in the 'x' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
-  return __builtin_amdgcn_grid_size_x() / __GPU_AMDGCN_UNIFORM_WG_SIZE(0);
+  return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
 }
 
 // Returns the number of workgroups in the 'y' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_y(void) {
-  return __builtin_amdgcn_grid_size_y() / __GPU_AMDGCN_UNIFORM_WG_SIZE(1);
+  return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
 }
 
 // Returns the number of workgroups in the 'z' dimension of the grid.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_z(void) {
-  return __builtin_amdgcn_grid_size_z() / __GPU_AMDGCN_UNIFORM_WG_SIZE(2);
+  return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
 }
 
 // Returns the 'x' dimension of the current AMD workgroup's id.
@@ -67,17 +59,17 @@ _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id_z(void) {
 
 // Returns the number of workitems in the 'x' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_x(void) {
-  return __GPU_AMDGCN_UNIFORM_WG_SIZE(0);
+  return __builtin_amdgcn_workgroup_size_x();
 }
 
 // Returns the number of workitems in the 'y' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_y(void) {
-  return __GPU_AMDGCN_UNIFORM_WG_SIZE(1);
+  return __builtin_amdgcn_workgroup_size_y();
 }
 
 // Returns the number of workitems in the 'z' dimension.
 _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads_z(void) {
-  return __GPU_AMDGCN_UNIFORM_WG_SIZE(2);
+  return __builtin_amdgcn_workgroup_size_z();
 }
 
 // Returns the 'x' dimension id of the workitem in the current AMD workgroup.