diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 264475cd6c2a..eb7cf6eec0a1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7817,7 +7817,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Gemma4AssistantForCausalLM") +@ModelBase.register("Gemma4AssistantForCausalLM", "Gemma4UnifiedAssistantForCausalLM") class Gemma4AssistantModel(Gemma4Model): model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 38d222b7016b..44995f0c1637 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -409,10 +409,12 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const if (V->ne[0] != K->ne[0]) { return BEST_FATTN_KERNEL_NONE; } - if (!gqa_opt_applies) { - return BEST_FATTN_KERNEL_NONE; - } - break; + // MMA template instances for DKQ=512 only exist for ncols2 in {4,8}, + // requiring gqa_ratio < 3. Gemma-4 (12B/26B/31B) has gqa_ratio=8 and + // aborts in switch_ncols2<512,512>. fattn-tile supports DKQ=512 with + // ncols2 fallback to {2,1}. Route ALL DKQ=512 cases to TILE here so + // the early-return path and the no-mask MTP cross-attn path are both covered. + return BEST_FATTN_KERNEL_TILE; case 576: case 640: if (V->ne[0] != 512) { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 822d97b829dc..3954b040763d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1105,10 +1105,14 @@ void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void void llama_context::set_embeddings(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); + const bool changed = (cparams.embeddings != value); cparams.embeddings = value; - - // TODO: not sure yet if we want to reserve here - //sched_need_reserve = true; + // Changing embeddings mode changes the graph topology (adds/removes the backbone + // hidden-state output node). Re-reserve so compute buffers are sized correctly. + // Only trigger re-reserve on actual change — not on redundant same-value calls. + if (changed) { + sched_need_reserve = true; + } } void llama_context::set_embeddings_pre_norm(bool value) { @@ -2692,6 +2696,11 @@ int32_t llama_context::decode_mtp_async( return -8; } + // Flush main scheduler so KV cache writes from the preceding llama_decode are + // visible to the MTP worker's CUDA reads before it starts. Without this sync + // the worker races the still-in-flight CUDA async writes and reads stale K/V. + ggml_backend_sched_synchronize(sched.get()); + { std::unique_lock lk(mtp_mu); if (mtp_pending.has_value() || mtp_in_flight || mtp_completed.has_value()) { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e8db32545c75..1ead8cdc13f8 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -79,7 +79,13 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) { ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens)); } - if (ubatch->embd) { + // Guard on `embd` being non-null (mirrors the can_reuse() check below): gemma4's + // per-layer-token input reuses this same input class but only allocates `tokens` + // (never `embd`). When the batch carries raw embeddings (ubatch->embd != null) -- + // e.g. the Gemma-4 MTP / embeddings verify path -- that per-layer input would + // otherwise dereference a null `embd` here. Only the input that actually built an + // `embd` tensor should consume ubatch->embd. + if (ubatch->embd && embd) { GGML_ASSERT(n_embd == embd->ne[0]); const int64_t n_tokens = ubatch->n_tokens; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index fd565d735dab..8822f3f60e88 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1650,6 +1650,13 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t) n_kv_shared_layers; hparams.f_attention_scale = 1.0f; + if (hparams.n_layer > 0 && hparams.n_layer_kv_from_start <= 0) { + LLAMA_LOG_WARN("%s: gemma4_assistant KV sharing metadata leaves no dedicated KV layers " + "(n_layer=%u, shared_kv_layers=%u); disabling reuse\n", + __func__, hparams.n_layer, n_kv_shared_layers); + hparams.n_layer_kv_from_start = (int32_t) hparams.n_layer; + } + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -9573,11 +9580,21 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { GGML_ABORT("fatal error"); } - // add on pooling layer - llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm); - - // add backend sampling layers (if any) - llm->build_sampling(); + // The Gemma-4 MTP graph is self-contained: it produces its own logits / h_post / + // on-device argmax and is driven on a dedicated scheduler. The shared embedding + // pooling and backend-sampling epilogues are for the main decode path only. They + // must be skipped for MTP graphs: the speculative path forces cparams.embeddings=true + // (so the main decode emits backbone hidden states), and that flag would otherwise + // make build_pooling run against the MTP graph's t_embd (h_post) -- which has no + // pooling inputs and crashes -- and build_sampling attach backend samplers to the + // MTP logits, which MTP never consumes. + if (params.gtype != LLM_GRAPH_TYPE_MTP) { + // add on pooling layer + llm->build_pooling(cls, cls_b, cls_out, cls_out_b, cls_norm); + + // add backend sampling layers (if any) + llm->build_sampling(); + } // if the gguf model was converted with --sentence-transformers-dense-modules // there will be two additional dense projection layers