From 5ca0e81964b9a5ba73e56c265ebd61c38ea86b03 Mon Sep 17 00:00:00 2001
From: Heimdall Lab <pegly86@gmail.com>
Date: Wed, 3 Jun 2026 20:42:26 -0400
Subject: [PATCH 1/2] gemma4_assistant: protect n_layer_kv_from_start against
 shared_kv_layers == n_layer

The GEMMA4 hparam-loading path already disables KV reuse when shared_kv_layers
leaves no dedicated KV layers, but the GEMMA4_ASSISTANT path next to it does
not. For 26B/31B assistants where block_count == shared_kv_layers == 4, this
leaves hparams.n_layer_kv_from_start at 0 and downstream tensor-creation code
hits a 0-length vector subscript (visible on Windows debug-iterators as
"invalid vector subscript"; UB elsewhere).

Mirrors the existing GEMMA4 protection a few lines above. Reproduces with
google/gemma-4-26B-A4B-it-assistant converted via convert_hf_to_gguf.py.

Edge variants (E2B/E4B) and the new 2026-06-03 12B Unified assistant likely
have different shared_kv_layers values that avoid this edge case, which is
why current AtomicChat-published GGUFs do not exhibit it.
---
 src/llama-model.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index fd565d735dab..b4c504348d1a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1650,6 +1650,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t) n_kv_shared_layers;
                 hparams.f_attention_scale     = 1.0f;
 
+                if (hparams.n_layer > 0 && hparams.n_layer_kv_from_start <= 0) {
+                    LLAMA_LOG_WARN("%s: gemma4_assistant KV sharing metadata leaves no dedicated KV layers "
+                                   "(n_layer=%u, shared_kv_layers=%u); disabling reuse\n",
+                            __func__, hparams.n_layer, n_kv_shared_layers);
+                    hparams.n_layer_kv_from_start = (int32_t) hparams.n_layer;
+                }
+
                 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

From de17ae2d250da91a211d639732ff1466de78e5a8 Mon Sep 17 00:00:00 2001
From: Heimdall Lab <pegly86@gmail.com>
Date: Sat, 6 Jun 2026 07:18:37 -0400
Subject: [PATCH 2/2] Gemma-4 MTP: skip pooling/sampling epilogue for MTP
 graphs (fixes decode crash)

Gemma-4 MTP speculative decode crashed with a silent access violation on the
first draft step. Root cause: the speculative path calls llama_set_embeddings(
ctx, true) so the main decode emits backbone hidden states for the drafter, and
that flag stays set. llama_model::build_graph() runs a shared build_pooling()/
build_sampling() epilogue on every graph; build_pooling only early-returns when
!cparams.embeddings, so it ran against the MTP graph's t_embd (= h_post, the
backbone projection, which has no pooling inputs) and dereferenced bad memory.

Fix: gate the epilogue on params.gtype != LLM_GRAPH_TYPE_MTP. The MTP graph is
self-contained (own logits / h_post / on-device argmax) and must not get the
main-decode pooling or backend-sampling layers. This also stops build_sampling
from attaching backend samplers to MTP logits, which MTP never consumes.

Supporting changes found during the investigation:
- ggml-cuda/fattn.cu: route ALL DKQ=512 cases to the TILE kernel. The previous
  gqa_ratio<3 guard returned BEST_FATTN_KERNEL_NONE for Gemma-4 12B/26B/31B
  (gqa_ratio=8), which aborts in the MMA dispatcher. TILE supports DKQ=512 with
  ncols2 fallback. Required for those targets to reach the MTP path at all.
- llama-graph.cpp: guard llm_graph_input_embd::set_input on embd != null
  (mirrors the existing can_reuse() check) so gemma4's per-layer-token input,
  which reuses this class but only allocates tokens, can't deref a null embd.
- llama-context.cpp: re-reserve on a real set_embeddings() mode change (topology
  change); synchronize the main scheduler in decode_mtp_async before handing off
  so the MTP worker doesn't race the preceding decode's in-flight KV writes.
- convert_hf_to_gguf.py: register Gemma4UnifiedAssistantForCausalLM as an alias
  of Gemma4AssistantForCausalLM so newer assistant checkpoints convert.

Validated: E4B and 12B MTP speculative decode now run end-to-end (4/4 requests
each, server healthy, draft tokens accepted).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 convert_hf_to_gguf.py       |  2 +-
 ggml/src/ggml-cuda/fattn.cu | 10 ++++++----
 src/llama-context.cpp       | 15 ++++++++++++---
 src/llama-graph.cpp         |  8 +++++++-
 src/llama-model.cpp         | 20 +++++++++++++++-----
 5 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 264475cd6c2a..eb7cf6eec0a1 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7817,7 +7817,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
-@ModelBase.register("Gemma4AssistantForCausalLM")
+@ModelBase.register("Gemma4AssistantForCausalLM", "Gemma4UnifiedAssistantForCausalLM")
 class Gemma4AssistantModel(Gemma4Model):
     model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT
 
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 38d222b7016b..44995f0c1637 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -409,10 +409,12 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
             if (V->ne[0] != K->ne[0]) {
                 return BEST_FATTN_KERNEL_NONE;
             }
-            if (!gqa_opt_applies) {
-                return BEST_FATTN_KERNEL_NONE;
-            }
-            break;
+            // MMA template instances for DKQ=512 only exist for ncols2 in {4,8},
+            // requiring gqa_ratio < 3. Gemma-4 (12B/26B/31B) has gqa_ratio=8 and
+            // aborts in switch_ncols2<512,512>. fattn-tile supports DKQ=512 with
+            // ncols2 fallback to {2,1}. Route ALL DKQ=512 cases to TILE here so
+            // the early-return path and the no-mask MTP cross-attn path are both covered.
+            return BEST_FATTN_KERNEL_TILE;
         case 576:
         case 640:
             if (V->ne[0] != 512) {
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 822d97b829dc..3954b040763d 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1105,10 +1105,14 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void
 void llama_context::set_embeddings(bool value) {
     LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
 
+    const bool changed = (cparams.embeddings != value);
     cparams.embeddings = value;
-
-    // TODO: not sure yet if we want to reserve here
-    //sched_need_reserve = true;
+    // Changing embeddings mode changes the graph topology (adds/removes the backbone
+    // hidden-state output node). Re-reserve so compute buffers are sized correctly.
+    // Only trigger re-reserve on actual change — not on redundant same-value calls.
+    if (changed) {
+        sched_need_reserve = true;
+    }
 }
 
 void llama_context::set_embeddings_pre_norm(bool value) {
@@ -2692,6 +2696,11 @@ int32_t llama_context::decode_mtp_async(
         return -8;
     }
 
+    // Flush main scheduler so KV cache writes from the preceding llama_decode are
+    // visible to the MTP worker's CUDA reads before it starts. Without this sync
+    // the worker races the still-in-flight CUDA async writes and reads stale K/V.
+    ggml_backend_sched_synchronize(sched.get());
+
     {
         std::unique_lock<std::mutex> lk(mtp_mu);
         if (mtp_pending.has_value() || mtp_in_flight || mtp_completed.has_value()) {
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index e8db32545c75..1ead8cdc13f8 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -79,7 +79,13 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
         ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens));
     }
 
-    if (ubatch->embd) {
+    // Guard on `embd` being non-null (mirrors the can_reuse() check below): gemma4's
+    // per-layer-token input reuses this same input class but only allocates `tokens`
+    // (never `embd`). When the batch carries raw embeddings (ubatch->embd != null) --
+    // e.g. the Gemma-4 MTP / embeddings verify path -- that per-layer input would
+    // otherwise dereference a null `embd` here. Only the input that actually built an
+    // `embd` tensor should consume ubatch->embd.
+    if (ubatch->embd && embd) {
         GGML_ASSERT(n_embd == embd->ne[0]);
 
         const int64_t n_tokens = ubatch->n_tokens;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b4c504348d1a..8822f3f60e88 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -9580,11 +9580,21 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             GGML_ABORT("fatal error");
     }
 
-    // add on pooling layer
-    llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm);
-
-    // add backend sampling layers (if any)
-    llm->build_sampling();
+    // The Gemma-4 MTP graph is self-contained: it produces its own logits / h_post /
+    // on-device argmax and is driven on a dedicated scheduler. The shared embedding
+    // pooling and backend-sampling epilogues are for the main decode path only. They
+    // must be skipped for MTP graphs: the speculative path forces cparams.embeddings=true
+    // (so the main decode emits backbone hidden states), and that flag would otherwise
+    // make build_pooling run against the MTP graph's t_embd (h_post) -- which has no
+    // pooling inputs and crashes -- and build_sampling attach backend samplers to the
+    // MTP logits, which MTP never consumes.
+    if (params.gtype != LLM_GRAPH_TYPE_MTP) {
+        // add on pooling layer
+        llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm);
+
+        // add backend sampling layers (if any)
+        llm->build_sampling();
+    }
 
     // if the gguf model was converted with --sentence-transformers-dense-modules
     // there will be two additional dense projection layers