From c253e49b988eef4495b024b65dc4e4f9467f395e Mon Sep 17 00:00:00 2001
From: Erik LaBianca
Date: Mon, 27 Apr 2026 15:48:06 -0400
Subject: [PATCH] =?UTF-8?q?fix(ggml-cuda):=20skip=20sm=5F120=E2=86=92sm=5F?=
 =?UTF-8?q?120a=20for=20consumer=20Blackwell=20(no=20FP4=20MMA)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consumer Blackwell GPUs (RTX 5090, SM 12.0) do not have FP4 tensor core
instructions. The existing code unconditionally replaces sm_120 with
sm_120a and compiles mmq-instance-mxfp4/nvfp4 with BLACKWELL_MMA_AVAILABLE,
which emits .block_scale / mxf4 PTX that faults on sm_120 hardware.

Add GGML_CUDA_BLACKWELL_CONSUMER option (set by parent build when
nvidia-smi reports SM 12.x without an explicit 'a' variant):

- Skip the 12X→12Xa arch replacement so ggml-cuda compiles for plain sm_120
- Exclude mmq-instance-mxfp4.cu and mmq-instance-nvfp4.cu from the build
- Guard their dispatch cases in mmq.cu to prevent linker errors and surface
  a clear abort if FP4 types are somehow requested at runtime
- In ggml_cuda_should_use_mmq, compile out only the MXFP4/NVFP4 case labels
  so that Q8_0, which falls through the same label chain, keeps its
  existing MMQ support

Co-Authored-By: Claude Sonnet 4.6
---
Review notes (not part of the commit message):
- This patch was re-flowed from a whitespace-collapsed copy. Indentation of
  all lines and the <GGML_TYPE_*> template arguments of mul_mat_q_case were
  reconstructed from the surrounding case labels; apply with
  `git apply --ignore-whitespace` or regenerate with `git format-patch`.
- The e-mail addresses on From: / Co-Authored-By: were missing in that copy
  and must be restored before `git am` will accept the patch.
- The post-image blob id on the mmq.cu index line predates the review fix.
- The second mmq.cu hunk assumes the switch's default branch reports
  "not supported" for types without a case label (not visible here) -- confirm.

 ggml/src/ggml-cuda/CMakeLists.txt | 18 ++++++++++++++++++
 ggml/src/ggml-cuda/mmq.cu         | 16 +++++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 6ed2f61fedb..d00cd941d58 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -76,6 +76,11 @@ if (CUDAToolkit_FOUND)
     # Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
     # But while 12X vs. 12Xa can be checked in device code there is (to my knowledge) no easy way to do the same check in host code.
     # So for now just replace all instances of 12X with 12Xa, this should be fine until Rubin is released.
+    #
+    # Set GGML_CUDA_BLACKWELL_CONSUMER=ON to skip this replacement for consumer Blackwell GPUs
+    # (e.g. RTX 5090, SM 12.0) that don't have FP4 tensor cores and will fault on 12Xa instructions.
+    option(GGML_CUDA_BLACKWELL_CONSUMER "Skip sm_12X→sm_12Xa replacement for consumer Blackwell" OFF)
+    if(NOT GGML_CUDA_BLACKWELL_CONSUMER)
     foreach(ARCHS IN ITEMS CMAKE_CUDA_ARCHITECTURES CMAKE_CUDA_ARCHITECTURES_NATIVE)
         set(FIXED_ARCHS "")
         foreach(ARCH IN LISTS ${ARCHS})
@@ -89,6 +94,7 @@ if (CUDAToolkit_FOUND)
         endforeach()
         set(${ARCHS} ${FIXED_ARCHS})
     endforeach()
+    endif() # NOT GGML_CUDA_BLACKWELL_CONSUMER
 
     # If we try to compile a "native" build it will use the 12X architectures and fail.
     # So we should instead use the native architectures as determined by CMake after replacing 12X with 12Xa.
@@ -111,6 +117,18 @@ if (CUDAToolkit_FOUND)
     file(GLOB SRCS "template-instances/mmf*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})
 
+    if(GGML_CUDA_BLACKWELL_CONSUMER)
+        # FP4 MMA kernels (mxfp4/nvfp4) require sm_120a instructions not present
+        # on consumer Blackwell (RTX 5090, SM 12.0).
+        list(REMOVE_ITEM GGML_SOURCES_CUDA
+            "${CMAKE_CURRENT_SOURCE_DIR}/template-instances/mmq-instance-mxfp4.cu"
+            "${CMAKE_CURRENT_SOURCE_DIR}/template-instances/mmq-instance-nvfp4.cu"
+        )
+        # Let dispatch code in mmq.cu know to skip FP4 cases.
+        add_compile_definitions(GGML_CUDA_BLACKWELL_CONSUMER)
+        message(STATUS "ggml-cuda: Excluding FP4 MMA kernels (GGML_CUDA_BLACKWELL_CONSUMER)")
+    endif()
+
     if (GGML_CUDA_FA_ALL_QUANTS)
         file(GLOB SRCS "template-instances/fattn-vec*.cu")
         list(APPEND GGML_SOURCES_CUDA ${SRCS})
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 27b4145ac9a..caa88c8112d 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -21,10 +21,16 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
             mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
             break;
         case GGML_TYPE_MXFP4:
-            mul_mat_q_case<GGML_TYPE_MXFP4>(ctx, args, stream);
-            break;
         case GGML_TYPE_NVFP4:
-            mul_mat_q_case<GGML_TYPE_NVFP4>(ctx, args, stream);
+#ifndef GGML_CUDA_BLACKWELL_CONSUMER
+            if (args.type_x == GGML_TYPE_MXFP4) {
+                mul_mat_q_case<GGML_TYPE_MXFP4>(ctx, args, stream);
+            } else {
+                mul_mat_q_case<GGML_TYPE_NVFP4>(ctx, args, stream);
+            }
+#else
+            GGML_ABORT("FP4 quantization requires sm_120a, not supported on consumer Blackwell (SM 12.0)");
+#endif // GGML_CUDA_BLACKWELL_CONSUMER
             break;
         case GGML_TYPE_Q2_K:
             mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
@@ -277,6 +283,10 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
         case GGML_TYPE_Q8_0:
+        // Compile the FP4 labels out instead of giving them a body of their own:
+        // Q8_0 above falls through here and must still reach the shared body below.
+#ifndef GGML_CUDA_BLACKWELL_CONSUMER
         case GGML_TYPE_MXFP4:
         case GGML_TYPE_NVFP4:
+#endif // GGML_CUDA_BLACKWELL_CONSUMER
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q4_K: