From 5ca0e81964b9a5ba73e56c265ebd61c38ea86b03 Mon Sep 17 00:00:00 2001 From: Heimdall Lab Date: Wed, 3 Jun 2026 20:42:26 -0400 Subject: [PATCH] gemma4_assistant: protect n_layer_kv_from_start against shared_kv_layers == n_layer The GEMMA4 hparam-loading path already disables KV reuse when shared_kv_layers leaves no dedicated KV layers, but the GEMMA4_ASSISTANT path next to it does not. For 26B/31B assistants where block_count == shared_kv_layers == 4, this leaves hparams.n_layer_kv_from_start at 0 and downstream tensor-creation code hits a 0-length vector subscript (visible on Windows debug-iterators as "invalid vector subscript"; UB elsewhere). Mirrors the existing GEMMA4 protection a few lines above. Reproduces with google/gemma-4-26B-A4B-it-assistant converted via convert_hf_to_gguf.py. Edge variants (E2B/E4B) and the new 2026-06-03 12B Unified assistant likely have different shared_kv_layers values that avoid this edge case, which is why current AtomicChat-published GGUFs do not exhibit it. --- src/llama-model.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index fd565d735dab..b4c504348d1a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1650,6 +1650,13 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t) n_kv_shared_layers; hparams.f_attention_scale = 1.0f; + if (hparams.n_layer > 0 && hparams.n_layer_kv_from_start <= 0) { + LLAMA_LOG_WARN("%s: gemma4_assistant KV sharing metadata leaves no dedicated KV layers " + "(n_layer=%u, shared_kv_layers=%u); disabling reuse\n", + __func__, hparams.n_layer, n_kv_shared_layers); + hparams.n_layer_kv_from_start = (int32_t) hparams.n_layer; + } + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);