From 5ca0e81964b9a5ba73e56c265ebd61c38ea86b03 Mon Sep 17 00:00:00 2001 From: Heimdall Lab Date: Wed, 3 Jun 2026 20:42:26 -0400 Subject: [PATCH 1/2] gemma4_assistant: protect n_layer_kv_from_start against shared_kv_layers == n_layer The GEMMA4 hparam-loading path already disables KV reuse when shared_kv_layers leaves no dedicated KV layers, but the GEMMA4_ASSISTANT path next to it does not. For 26B/31B assistants where block_count == shared_kv_layers == 4, this leaves hparams.n_layer_kv_from_start at 0 and downstream tensor-creation code hits a 0-length vector subscript (visible on Windows debug-iterators as "invalid vector subscript"; UB elsewhere). Mirrors the existing GEMMA4 protection a few lines above. Reproduces with google/gemma-4-26B-A4B-it-assistant converted via convert_hf_to_gguf.py. Edge variants (E2B/E4B) and the new 2026-06-03 12B Unified assistant likely have different shared_kv_layers values that avoid this edge case, which is why current AtomicChat-published GGUFs do not exhibit it. --- src/llama-model.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index fd565d735dab..b4c504348d1a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1650,6 +1650,13 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t) n_kv_shared_layers; hparams.f_attention_scale = 1.0f; + if (hparams.n_layer > 0 && hparams.n_layer_kv_from_start <= 0) { + LLAMA_LOG_WARN("%s: gemma4_assistant KV sharing metadata leaves no dedicated KV layers " + "(n_layer=%u, shared_kv_layers=%u); disabling reuse\n", + __func__, hparams.n_layer, n_kv_shared_layers); + hparams.n_layer_kv_from_start = (int32_t) hparams.n_layer; + } + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); From de17ae2d250da91a211d639732ff1466de78e5a8 Mon Sep 17 00:00:00 2001 From: Heimdall Lab Date: Sat, 6 Jun 2026 07:18:37 -0400 Subject: [PATCH 2/2] Gemma-4 MTP: skip pooling/sampling epilogue for MTP graphs (fixes decode crash) Gemma-4 MTP speculative decode crashed with a silent access violation on the first draft step. Root cause: the speculative path calls llama_set_embeddings( ctx, true) so the main decode emits backbone hidden states for the drafter, and that flag stays set. llama_model::build_graph() runs a shared build_pooling()/ build_sampling() epilogue on every graph; build_pooling only early-returns when !cparams.embeddings, so it ran against the MTP graph's t_embd (= h_post, the backbone projection, which has no pooling inputs) and dereferenced bad memory. Fix: gate the epilogue on params.gtype != LLM_GRAPH_TYPE_MTP. The MTP graph is self-contained (own logits / h_post / on-device argmax) and must not get the main-decode pooling or backend-sampling layers. This also stops build_sampling from attaching backend samplers to MTP logits, which MTP never consumes. Supporting changes found during the investigation: - ggml-cuda/fattn.cu: route ALL DKQ=512 cases to the TILE kernel. The previous gqa_ratio<3 guard returned BEST_FATTN_KERNEL_NONE for Gemma-4 12B/26B/31B (gqa_ratio=8), which aborts in the MMA dispatcher. TILE supports DKQ=512 with ncols2 fallback. Required for those targets to reach the MTP path at all. - llama-graph.cpp: guard llm_graph_input_embd::set_input on embd != null (mirrors the existing can_reuse() check) so gemma4's per-layer-token input, which reuses this class but only allocates tokens, can't deref a null embd. - llama-context.cpp: re-reserve on a real set_embeddings() mode change (topology change); synchronize the main scheduler in decode_mtp_async before handing off so the MTP worker doesn't race the preceding decode's in-flight KV writes. - convert_hf_to_gguf.py: register Gemma4UnifiedAssistantForCausalLM as an alias of Gemma4AssistantForCausalLM so newer assistant checkpoints convert. Validated: E4B and 12B MTP speculative decode now run end-to-end (4/4 requests each, server healthy, draft tokens accepted). Co-Authored-By: Claude Opus 4.8 (1M context) --- convert_hf_to_gguf.py | 2 +- ggml/src/ggml-cuda/fattn.cu | 10 ++++++---- src/llama-context.cpp | 15 ++++++++++++--- src/llama-graph.cpp | 8 +++++++- src/llama-model.cpp | 20 +++++++++++++++----- 5 files changed, 41 insertions(+), 14 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 264475cd6c2a..eb7cf6eec0a1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7817,7 +7817,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Gemma4AssistantForCausalLM") +@ModelBase.register("Gemma4AssistantForCausalLM", "Gemma4UnifiedAssistantForCausalLM") class Gemma4AssistantModel(Gemma4Model): model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 38d222b7016b..44995f0c1637 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -409,10 +409,12 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const if (V->ne[0] != K->ne[0]) { return BEST_FATTN_KERNEL_NONE; } - if (!gqa_opt_applies) { - return BEST_FATTN_KERNEL_NONE; - } - break; + // MMA template instances for DKQ=512 only exist for ncols2 in {4,8}, + // requiring gqa_ratio < 3. Gemma-4 (12B/26B/31B) has gqa_ratio=8 and + // aborts in switch_ncols2<512,512>. fattn-tile supports DKQ=512 with + // ncols2 fallback to {2,1}. Route ALL DKQ=512 cases to TILE here so + // the early-return path and the no-mask MTP cross-attn path are both covered. + return BEST_FATTN_KERNEL_TILE; case 576: case 640: if (V->ne[0] != 512) { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 822d97b829dc..3954b040763d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1105,10 +1105,14 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void void llama_context::set_embeddings(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); + const bool changed = (cparams.embeddings != value); cparams.embeddings = value; - - // TODO: not sure yet if we want to reserve here - //sched_need_reserve = true; + // Changing embeddings mode changes the graph topology (adds/removes the backbone + // hidden-state output node). Re-reserve so compute buffers are sized correctly. + // Only trigger re-reserve on actual change — not on redundant same-value calls. + if (changed) { + sched_need_reserve = true; + } } void llama_context::set_embeddings_pre_norm(bool value) { @@ -2692,6 +2696,11 @@ int32_t llama_context::decode_mtp_async( return -8; } + // Flush main scheduler so KV cache writes from the preceding llama_decode are + // visible to the MTP worker's CUDA reads before it starts. Without this sync + // the worker races the still-in-flight CUDA async writes and reads stale K/V. + ggml_backend_sched_synchronize(sched.get()); + { std::unique_lock lk(mtp_mu); if (mtp_pending.has_value() || mtp_in_flight || mtp_completed.has_value()) { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e8db32545c75..1ead8cdc13f8 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -79,7 +79,13 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens)); } - if (ubatch->embd) { + // Guard on `embd` being non-null (mirrors the can_reuse() check below): gemma4's + // per-layer-token input reuses this same input class but only allocates `tokens` + // (never `embd`). When the batch carries raw embeddings (ubatch->embd != null) -- + // e.g. the Gemma-4 MTP / embeddings verify path -- that per-layer input would + // otherwise dereference a null `embd` here. Only the input that actually built an + // `embd` tensor should consume ubatch->embd. + if (ubatch->embd && embd) { GGML_ASSERT(n_embd == embd->ne[0]); const int64_t n_tokens = ubatch->n_tokens; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b4c504348d1a..8822f3f60e88 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9580,11 +9580,21 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { GGML_ABORT("fatal error"); } - // add on pooling layer - llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm); - - // add backend sampling layers (if any) - llm->build_sampling(); + // The Gemma-4 MTP graph is self-contained: it produces its own logits / h_post / + // on-device argmax and is driven on a dedicated scheduler. The shared embedding + // pooling and backend-sampling epilogues are for the main decode path only. They + // must be skipped for MTP graphs: the speculative path forces cparams.embeddings=true + // (so the main decode emits backbone hidden states), and that flag would otherwise + // make build_pooling run against the MTP graph's t_embd (h_post) -- which has no + // pooling inputs and crashes -- and build_sampling attach backend samplers to the + // MTP logits, which MTP never consumes. + if (params.gtype != LLM_GRAPH_TYPE_MTP) { + // add on pooling layer + llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm); + + // add backend sampling layers (if any) + llm->build_sampling(); + } // if the gguf model was converted with --sentence-transformers-dense-modules // there will be two additional dense projection layers