From c253e49b988eef4495b024b65dc4e4f9467f395e Mon Sep 17 00:00:00 2001
From: Erik LaBianca <erik.labianca@gmail.com>
Date: Mon, 27 Apr 2026 15:48:06 -0400
Subject: [PATCH] =?UTF-8?q?fix(ggml-cuda):=20skip=20sm=5F120=E2=86=92sm=5F?=
 =?UTF-8?q?120a=20for=20consumer=20Blackwell=20(no=20FP4=20MMA)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consumer Blackwell GPUs (RTX 5090, SM 12.0) do not have FP4 tensor core
instructions. The existing code unconditionally replaces sm_120 with sm_120a
and compiles mmq-instance-mxfp4/nvfp4 with BLACKWELL_MMA_AVAILABLE, which
emits .block_scale / mxf4 PTX that faults on sm_120 hardware.

Add GGML_CUDA_BLACKWELL_CONSUMER option (set by parent build when nvidia-smi
reports SM 12.x without an explicit 'a' variant):
- Skip the 12X→12Xa arch replacement so ggml-cuda compiles for plain sm_120
- Exclude mmq-instance-mxfp4.cu and mmq-instance-nvfp4.cu from the build
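- Guard their dispatch cases in mmq.cu to prevent linker errors and
  surface a clear abort if FP4 types are somehow requested at runtime
- Make ggml_cuda_should_use_mmq() report MMQ as unsupported for the FP4
  types when GGML_CUDA_BLACKWELL_CONSUMER is defined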

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ggml/src/ggml-cuda/CMakeLists.txt | 18 ++++++++++++++++++
 ggml/src/ggml-cuda/mmq.cu         | 16 +++++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt
index 6ed2f61fedb..d00cd941d58 100644
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -76,6 +76,11 @@ if (CUDAToolkit_FOUND)
     # Notably the Blackwell FP4 tensor core instructions are not forwards compatible and therefore need 12Xa.
     # But while 12X vs. 12Xa can be checked in device code there is (to my knowledge) no easy way to do the same check in host code.
     # So for now just replace all instances of 12X with 12Xa, this should be fine until Rubin is released.
+    #
+    # Set GGML_CUDA_BLACKWELL_CONSUMER=ON to skip this replacement for consumer Blackwell GPUs (e.g. RTX 5090, SM 12.0)
+    # that don't have FP4 tensor cores and will fault on 12Xa instructions. The option also excludes the FP4 MMQ kernels.
+    option(GGML_CUDA_BLACKWELL_CONSUMER "Skip sm_12X→sm_12Xa replacement for consumer Blackwell" OFF)
+    if(NOT GGML_CUDA_BLACKWELL_CONSUMER)
     foreach(ARCHS IN ITEMS CMAKE_CUDA_ARCHITECTURES CMAKE_CUDA_ARCHITECTURES_NATIVE)
         set(FIXED_ARCHS "")
         foreach(ARCH IN LISTS ${ARCHS})
@@ -89,6 +94,7 @@ if (CUDAToolkit_FOUND)
         endforeach()
         set(${ARCHS} ${FIXED_ARCHS})
     endforeach()
+    endif() # NOT GGML_CUDA_BLACKWELL_CONSUMER
 
     # If we try to compile a "native" build it will use the 12X architectures and fail.
     # So we should instead use the native architectures as determined by CMake after replacing 12X with 12Xa.
@@ -111,6 +117,18 @@ if (CUDAToolkit_FOUND)
     file(GLOB   SRCS "template-instances/mmf*.cu")
     list(APPEND GGML_SOURCES_CUDA ${SRCS})
 
+    if(GGML_CUDA_BLACKWELL_CONSUMER)
+        # Consumer Blackwell (RTX 5090, SM 12.0) lacks the sm_120a instructions that the FP4 MMA
+        # kernels (mxfp4/nvfp4) need, so drop their template instances from the source list.
+        foreach(FP4_KIND IN ITEMS mxfp4 nvfp4)
+            list(REMOVE_ITEM GGML_SOURCES_CUDA
+                "${CMAKE_CURRENT_SOURCE_DIR}/template-instances/mmq-instance-${FP4_KIND}.cu")
+        endforeach()
+        # mmq.cu checks this definition to skip its FP4 dispatch cases.
+        add_compile_definitions(GGML_CUDA_BLACKWELL_CONSUMER)
+        message(STATUS "ggml-cuda: Excluding FP4 MMA kernels (GGML_CUDA_BLACKWELL_CONSUMER)")
+    endif()
+
     if (GGML_CUDA_FA_ALL_QUANTS)
         file(GLOB   SRCS "template-instances/fattn-vec*.cu")
         list(APPEND GGML_SOURCES_CUDA ${SRCS})
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index 27b4145ac9a..caa88c8112d 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -21,10 +21,16 @@ static void ggml_cuda_mul_mat_q_switch_type(ggml_backend_cuda_context & ctx, con
             mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
             break;
         case GGML_TYPE_MXFP4:
-            mul_mat_q_case<GGML_TYPE_MXFP4>(ctx, args, stream);
-            break;
         case GGML_TYPE_NVFP4:
-            mul_mat_q_case<GGML_TYPE_NVFP4>(ctx, args, stream);
+#ifdef GGML_CUDA_BLACKWELL_CONSUMER // FP4 MMQ instances are excluded from this build
+            GGML_ABORT("FP4 quantization requires sm_120a, not supported on consumer Blackwell (SM 12.0)");
+#else // both FP4 labels share this body; pick the instance from the source type
+            if (args.type_x != GGML_TYPE_MXFP4) {
+                mul_mat_q_case<GGML_TYPE_NVFP4>(ctx, args, stream);
+            } else {
+                mul_mat_q_case<GGML_TYPE_MXFP4>(ctx, args, stream);
+            }
+#endif // GGML_CUDA_BLACKWELL_CONSUMER
             break;
         case GGML_TYPE_Q2_K:
             mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
@@ -277,6 +283,10 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11, int64_t
         case GGML_TYPE_Q8_0:
+#ifndef GGML_CUDA_BLACKWELL_CONSUMER
+        // FP4 MMQ kernels are not built for consumer Blackwell. Compile the labels out (instead of break-ing here) so
+        // Q8_0 and the labels above keep MMQ; FP4 then takes the default branch (assumed to report unsupported - verify).
         case GGML_TYPE_MXFP4:
         case GGML_TYPE_NVFP4:
+#endif // GGML_CUDA_BLACKWELL_CONSUMER
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q4_K: