From 5ca0e81964b9a5ba73e56c265ebd61c38ea86b03 Mon Sep 17 00:00:00 2001
From: Heimdall Lab <pegly86@gmail.com>
Date: Wed, 3 Jun 2026 20:42:26 -0400
Subject: [PATCH] gemma4_assistant: protect n_layer_kv_from_start against
 shared_kv_layers == n_layer

The GEMMA4 hparam-loading path already disables KV reuse when shared_kv_layers
leaves no dedicated KV layers, but the GEMMA4_ASSISTANT path next to it does
not. For 26B/31B assistants where block_count == shared_kv_layers == 4, this
leaves hparams.n_layer_kv_from_start at 0, so downstream tensor-creation code
indexes into a zero-length vector (reported as "invalid vector subscript" with
debug iterators on Windows; undefined behavior elsewhere).

This change mirrors the existing GEMMA4 protection a few lines above. The
failure reproduces with google/gemma-4-26B-A4B-it-assistant converted via
convert_hf_to_gguf.py.

Edge variants (E2B/E4B) and the new 2026-06-03 12B Unified assistant likely
have different shared_kv_layers values that avoid this edge case, which is
why current AtomicChat-published GGUFs do not exhibit it.
---
 src/llama-model.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index fd565d735dab..b4c504348d1a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1650,6 +1650,13 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t) n_kv_shared_layers;
                 hparams.f_attention_scale     = 1.0f;
 
+                if (hparams.n_layer > 0 && hparams.n_layer_kv_from_start <= 0) {
+                    LLAMA_LOG_WARN("%s: gemma4_assistant KV sharing metadata leaves no dedicated KV layers "
+                                   "(n_layer=%u, shared_kv_layers=%u); disabling reuse\n",
+                            __func__, hparams.n_layer, n_kv_shared_layers);
+                    hparams.n_layer_kv_from_start = (int32_t) hparams.n_layer;
+                }
+
                 ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA,          hparams.rope_freq_base_train_swa, false);
                 ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,    hparams.n_swa);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);