Add masked-embedding tensor plumbing for the Gemma 4 assistant architecture.

The converter filters out tensors whose name contains "masked_embedding"; the
GGUF tensor enums, name tables, HF name mappings and optional loader entries
for the two masked-embedding tensors are added alongside.

Note: the post-image hashes on the "index" lines were not regenerated after
the review edits to the added lines.

diff --git a/conversion/gemma.py b/conversion/gemma.py
index d8cf8be575c8..5b4ca5c583df 100644
--- a/conversion/gemma.py
+++ b/conversion/gemma.py
@@ -789,6 +789,20 @@ def set_gguf_parameters(self):
 class Gemma4AssistantModel(Gemma4Model):
     model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT
 
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        """Return None for masked-embedding tensors; otherwise defer to the parent filter.
+
+        `item` is a `(name, callable)` pair; only the name is inspected here.
+        """
+        name, _ = item
+
+        if "masked_embedding" in name:
+            logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.")
+            return None
+
+        return super().filter_tensors(item)
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"])
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index bd6246137b0a..584594097346 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -538,6 +538,8 @@ class VISION_PROJECTOR_TYPE(IntEnum):
 class MODEL_TENSOR(IntEnum):
     TOKEN_EMBD = auto()
     TOKEN_EMBD_NORM = auto()
+    MASKED_EMBD_CENTROIDS = auto()
+    MASKED_EMBD_ORDERING = auto()
     TOKEN_TYPES = auto()
     POS_EMBD = auto()
     OUTPUT = auto()
@@ -1087,6 +1089,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.TOKEN_EMBD: "token_embd",
     MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
     MODEL_TENSOR.TOKEN_TYPES: "token_types",
+    MODEL_TENSOR.MASKED_EMBD_CENTROIDS: "masked_embd_centroids",
+    MODEL_TENSOR.MASKED_EMBD_ORDERING: "masked_embd_ordering",
     MODEL_TENSOR.POS_EMBD: "position_embd",
     MODEL_TENSOR.OUTPUT_NORM: "output_norm",
     MODEL_TENSOR.OUTPUT: "output",
@@ -2586,6 +2590,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.GEMMA4_ASSISTANT: [
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.MASKED_EMBD_CENTROIDS,
+        MODEL_TENSOR.MASKED_EMBD_ORDERING,
         MODEL_TENSOR.OUTPUT_NORM,
         MODEL_TENSOR.NEXTN_PROJ_PRE,
         MODEL_TENSOR.NEXTN_PROJ_POST,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index a9537983de1f..5f1e28818509 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -37,6 +37,14 @@ class TensorNameMap:
             "model.embed", # talkie
         ),
 
+        # Masked embeddings
+        MODEL_TENSOR.MASKED_EMBD_CENTROIDS: (
+            "masked_embedding.centroids", # gemma-4 E2B/E4B assistants
+        ),
+        MODEL_TENSOR.MASKED_EMBD_ORDERING: (
+            "masked_embedding.token_ordering", # gemma-4 E2B/E4B assistants
+        ),
+
         # Token type embeddings
         MODEL_TENSOR.TOKEN_TYPES: (
             "embeddings.token_type_embeddings", # bert nomic-bert
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 6a5d5f8d2ac8..680b5fc64df3 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -559,6 +559,8 @@ static const std::map LLM_TENSOR_NAMES = {
     { LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
     { LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
     { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
+    { LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" },
+    { LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" },
 };
 
 // declare information about the model weight tensors:
@@ -783,6 +785,8 @@ static const std::map LLM_TENSOR_INFOS = {
     // latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU
     {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
+    {LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 03b1a265d67a..b65fce72e646 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -566,8 +566,10 @@ enum llm_tensor {
     LLM_TENSOR_NEXTN_HNORM,
     LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
     LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+    LLM_TENSOR_MASKED_EMBD_CENTROIDS,
+    LLM_TENSOR_MASKED_EMBD_ORDERING,
 };
 
 enum llm_tensor_layer {
     LLM_TENSOR_LAYER_INPUT,
     LLM_TENSOR_LAYER_REPEATING,
diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp
index 5b7a25a5abaf..6378130e79ec 100644
--- a/src/models/gemma4-assistant.cpp
+++ b/src/models/gemma4-assistant.cpp
@@ -39,6 +39,10 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) {
 
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
 
+    // optional tensors; the returned handles are discarded. NOTE(review): shape is passed as {} - confirm against the stored dims
+    create_tensor(tn(LLM_TENSOR_MASKED_EMBD_CENTROIDS, "weight"), {}, TENSOR_NOT_REQUIRED);
+    create_tensor(tn(LLM_TENSOR_MASKED_EMBD_ORDERING), {}, TENSOR_NOT_REQUIRED);
+
     const int64_t n_embd_backbone = hparams.n_embd_inp();
 
     nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0);