Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
0806a96
llama : enable layer input extraction
ggerganov May 5, 2026
800494f
spec: support eagle3
ruixiang63 May 18, 2026
16e6555
eagle3: fix params bug
ruixiang63 May 18, 2026
752bf23
eagle3: support Gemma4 eagle3 from RedHatAI
ruixiang63 May 20, 2026
b32d9eb
eagle3: set sync when get features from target
ruixiang63 May 27, 2026
7c5f428
eagle3 : fix ubatch handling in embd_layer_inp extraction and encoder
ruixiang63 May 27, 2026
91b9cfc
eagle3: adapt to upstream changes
ruixiang63 Jun 5, 2026
4ca8087
eagle3: fix rebase issues and adapt to upstream changes
ruixiang63 Jun 8, 2026
413c16d
eagle3:exclude the eagle3 arch from test-llama-archs
ruixiang63 Jun 8, 2026
6c21222
eagle3: fix editorconfig check failures
ruixiang63 Jun 8, 2026
ac7e2b2
eagle3: fix multi-seq issue in d2t vocab mapping
ruixiang63 Jun 9, 2026
544aaa2
cont : minor style / clean-up
ggerganov Jun 10, 2026
5738c9a
Merge branch 'master' into pr/18039
ggerganov Jun 10, 2026
b9f41d1
spec : remove `common_speculative_setup_draft_model()`
ggerganov Jun 10, 2026
f3fbbed
llama : clean-up unused API
ggerganov Jun 10, 2026
8002c4c
eagle3: set d2t vocab mapping in decode graph
ruixiang63 Jun 10, 2026
33b02df
cont : assert layer inputs are configured
ggerganov Jun 10, 2026
7857221
hparams : use n_embd_inp instead of n_embd_target_features
ggerganov Jun 11, 2026
9b2543d
eagle3: make output.weight optional and inherit from target model whe…
ruixiang63 Jun 11, 2026
1d55316
hparams : generic norm-before-residual param
ggerganov Jun 11, 2026
2de116b
llama-ext : consistent names
ggerganov Jun 11, 2026
f408879
cont : fix
ggerganov Jun 11, 2026
d373233
hparams : remove target_hidden_size
ggerganov Jun 11, 2026
5caedbc
cparams : rename output_layer_inp -> embeddings_layer_inp
ggerganov Jun 11, 2026
0274f0f
arch : reuse ATTN_NORM_2 instead of adding new hidden norm
ggerganov Jun 11, 2026
9baa68b
llama : clean-up names
ggerganov Jun 11, 2026
0bd5449
cont : add assert + comment
ggerganov Jun 11, 2026
7c42aff
Update conversion/llama.py
ruixiang63 Jun 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
428 changes: 418 additions & 10 deletions common/speculative.cpp

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions conversion/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@
"LlamaBidirectionalModel": "llama",
"LlamaForCausalLM": "llama",
"LlamaModel": "llama",
"Eagle3DraftModel": "llama",
"Eagle3Speculator": "llama",
"LlamaForCausalLMEagle3": "llama",
"LlavaForConditionalGeneration": "llama",
"LlavaStableLMEpochForCausalLM": "stablelm",
"MPTForCausalLM": "mpt",
Expand Down
4 changes: 4 additions & 0 deletions conversion/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
metadata: gguf.Metadata
dir_model_card: Path
remote_hf_model_id: str | None
target_model_dir: Path | None
Comment thread
CISC marked this conversation as resolved.

# subclasses should define this!
model_arch: gguf.MODEL_ARCH
Expand All @@ -119,6 +120,7 @@
small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
disable_mistral_community_chat_template: bool = False,
sentence_transformers_dense_modules: bool = False,
target_model_dir: Path | None = None,
Comment thread
CISC marked this conversation as resolved.
fuse_gate_up_exps: bool = False,
fp8_as_q8: bool = False):
if type(self) is ModelBase or \
Expand All @@ -139,6 +141,7 @@
self.dry_run = dry_run
self.remote_hf_model_id = remote_hf_model_id
self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
self.target_model_dir = target_model_dir
Comment thread
CISC marked this conversation as resolved.
self.fuse_gate_up_exps = fuse_gate_up_exps
self._gate_exp_buffer: dict[int, Tensor] = {}
self._up_exp_buffer: dict[int, Tensor] = {}
Expand Down Expand Up @@ -1332,15 +1335,15 @@

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]

Check warning on line 1338 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1338:76: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]

Check warning on line 1339 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1339:60: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

tokpre = self.get_vocab_base_pre(tokenizer)

reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]

Check warning on line 1343 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1343:93: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]

Check warning on line 1344 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1344:52: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]

Check warning on line 1346 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1346:64: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

for i in range(vocab_size):
if i not in reverse_vocab:
Expand All @@ -1353,7 +1356,7 @@
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
if not added_tokens_decoder[i].normalized:
previous_token = token
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) # ty: ignore[unresolved-attribute, invalid-assignment]

Check warning on line 1359 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1359:102: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
if previous_token != token:
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

Expand Down Expand Up @@ -1714,14 +1717,14 @@
def _set_vocab_hybriddna(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) # ty: ignore[unresolved-attribute]

Check warning on line 1720 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1720:76: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
assert max(tokenizer.vocab.values()) < vocab_size # ty: ignore[unresolved-attribute]

Check warning on line 1721 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1721:60: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment

reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} # ty: ignore[unresolved-attribute]

Check warning on line 1723 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1723:93: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
# k-mers can share text with a base-vocab BPE token (e.g. CCCCCC) and get
# dropped by get_vocab(); a reserved marker suffix (U+E000) keeps each
# k-mer's own id (llama.cpp strips it on detokenization)
for kmer in tokenizer.kmers: # ty: ignore[unresolved-attribute]

Check warning on line 1727 in conversion/base.py

View workflow job for this annotation

GitHub Actions / python type-check

ty (unused-ignore-comment)

conversion/base.py:1727:39: unused-ignore-comment: Unused `ty: ignore` directive help: Remove the unused suppression comment
reverse_vocab[tokenizer.dna_token_to_id[kmer]] = kmer + "\ue000" # ty: ignore[unresolved-attribute]
added_vocab = tokenizer.get_added_vocab() # ty: ignore[unresolved-attribute]
added_tokens_decoder = tokenizer.added_tokens_decoder # ty: ignore[unresolved-attribute]
Expand Down Expand Up @@ -2481,6 +2484,7 @@
torch.float16: np.float16,
torch.float32: np.float32,
torch.uint8: np.uint8,
torch.int64: np.int64,
}

# only used when byteswapping data. Only correct size is needed
Expand Down
131 changes: 130 additions & 1 deletion conversion/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@

from typing import Callable, Iterable, TYPE_CHECKING

import numpy as np
import torch

if TYPE_CHECKING:
from torch import Tensor

from .base import ModelBase, TextModel, gguf
from .base import ModelBase, TextModel, gguf, logger


@ModelBase.register(
Expand All @@ -21,6 +22,9 @@
"VLlama3ForCausalLM",
"LlavaForConditionalGeneration",
"VoxtralForConditionalGeneration",
"LlamaForCausalLMEagle3",
"Eagle3Speculator",
"Eagle3DraftModel",
"IQuestCoderForCausalLM",
"LlamaModel")
class LlamaModel(TextModel):
Expand All @@ -39,7 +43,61 @@ def __init__(self, *args, **kwargs):
hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
self.origin_hf_arch = hparams.get('architectures', [None])[0]

# Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name)
if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not important right now, but I'm guessing all this will basically be duplicated for every arch supported with very little if any differences? Would be nice if it can be refactored in a reusable way.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point! I kept this local for now because all Eagle3 checkpoints I have encountered so far are based on the Llama decoder (regardless of where they come from: RedHat, LMSYS, NVIDIA, etc.), and this PR only targets that path unless we find an Eagle3 checkpoint based on a different architecture (potentially this one: #18039 (comment), but I am not sure).

If another architecture needs Eagle3 conversion later, this should be the first piece to factor out.

self.is_eagle3 = True
self.model_arch = gguf.MODEL_ARCH.EAGLE3
logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture")
# Re-initialize tensor_map with eagle3 architecture
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
# Update gguf_writer architecture
self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
self.gguf_writer.add_architecture()
if self.target_model_dir is None:
raise ValueError(
"EAGLE-3 model requires --target-model-dir to be specified. "
"Please provide the path to the target model directory to read config.json"
)
# Read both eagle3 raw config and target model config
with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f:
eagle3_raw_config = json.load(f)
with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
target_config = json.load(f)

if "text_config" in target_config:
target_config = {**target_config, **target_config["text_config"]}
self.target_vocab_size = target_config["vocab_size"]

# target_layers: derived from target model layer count (low/mid/high)
target_num_layers = target_config["num_hidden_layers"]
target_layers = [2, target_num_layers // 2, target_num_layers - 3]

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also prefer the eagle3 config when eagle_aux_hidden_state_layer_ids is present?

Same question for vocab size when draft_vocab_size exists

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also prefer the eagle3 config when eagle_aux_hidden_state_layer_ids is present?

Good question. First, many Eagle3 checkpoints do not include eagle_aux_hidden_state_layer_ids. Also, different Eagle3 checkpoints interpret layer_ids differently: some expect the IDs to be set before extracting the layers, while others expect them to be set afterward, which can sometimes require adding +1.
To avoid this ambiguity, I decided to compute the values manually based on the original paper and its implementation, rather than relying on the Eagle3 config. This ensures that the target layers are 100% correct without postprocessing and keeps the code logic aligned.

Same question for vocab size when draft_vocab_size exists

Both draft_vocab_size and the target model’s vocab_size are needed when performing the d2t vocab mapping for Eagle3. The target model’s vocab_size is used in an assertion that ensures the d2t mapping does not go out of vocabulary.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yikes, guess the EAGLE3 rollout has not been smooth 😅

thanks for the clarity! unfortunate but logical :)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah it is. Thanks for the review!

logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers)

# target_hidden_size: prefer eagle3 config, fallback to target config
if eagle3_raw_config.get("target_hidden_size") is not None:
target_hidden_size = eagle3_raw_config["target_hidden_size"]
src = "EAGLE-3 config"
else:
target_hidden_size = target_config["hidden_size"]
src = "target model config"
logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)

# norm_before_residual (RedHat-style eagle3 specific)
norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)

def set_vocab(self):
# eagle3: use tokenizer from target model if provided
original_dir_model = None
if getattr(self, 'is_eagle3', False):
assert self.target_model_dir is not None
logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}")
original_dir_model = self.dir_model
self.dir_model = self.target_model_dir

if self.origin_hf_arch == "GlmasrModel":
return self._set_vocab_glmedge()

Expand Down Expand Up @@ -85,6 +143,10 @@ def set_vocab(self):
if self.hparams.get("vocab_size", 32000) == 49152:
self.gguf_writer.add_add_bos_token(False)

# eagle3: Restore original dir_model
if original_dir_model is not None:
self.dir_model = original_dir_model

def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
Expand Down Expand Up @@ -129,7 +191,49 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca

return super().filter_tensors((name, gen))

def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
    """Index the checkpoint tensors, relocating EAGLE-3 draft-layer names.

    The index built by the parent class is returned as-is unless the hparams
    contain ``draft_vocab_size`` and report exactly one hidden layer. In that
    case a new mapping is returned in which ``midlayer.*`` and ``layers.0.*``
    entries are renamed to ``model.layers.0.*``; all other entries keep their
    names. As a side effect, a nested ``transformer_layer_config`` dict found
    in the hparams is merged into the top level of ``self.hparams``.

    Args:
        remote_hf_model_id: forwarded unchanged to the parent implementation.

    Returns:
        Mapping from tensor name to the callable that produces the tensor.
    """
    indexed = super().index_tensors(remote_hf_model_id)

    # Eagle3Speculator nests its layer settings; lift them to the top level
    # (nested keys win over same-named top-level keys)
    if "transformer_layer_config" in self.hparams:
        nested_cfg = self.hparams["transformer_layer_config"]
        self.hparams = {**self.hparams, **nested_cfg}

    # same hparams-based eagle3 detection: draft vocab present and a single layer
    is_eagle3_draft = "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1
    if not is_eagle3_draft:
        return indexed

    logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*")

    def relocate(tensor_name: str) -> str:
        # map a checkpoint tensor name onto the model.layers.0.* layout
        if tensor_name.startswith("midlayer."):
            return "model.layers.0." + tensor_name[len("midlayer."):]
        if tensor_name.startswith("layers.0."):  # Eagle3Speculator format
            return "model." + tensor_name
        return tensor_name

    # iteration order of the parent index is preserved in the renamed mapping
    return {relocate(tensor_name): loader for tensor_name, loader in indexed.items()}

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# eagle3: special tensors that bypass standard llama mapping
if getattr(self, 'is_eagle3', False):
if name == "fc.weight":
yield (name, data_torch)
return
if name == "d2t":
# store for manual int64 handling in prepare_tensors (avoid F32 conversion)
if not hasattr(self, '_eagle3_int_tensors'):
self._eagle3_int_tensors = {}
self._eagle3_int_tensors[name] = data_torch
return
if name == "t2d":
# not used at runtime, skip
return
if name.endswith(".hidden_norm.weight"):
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch)
return

n_head = self.find_hparam(["n_heads", "num_attention_heads"])
n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])

Expand Down Expand Up @@ -205,8 +309,33 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))

def prepare_tensors(self):
# eagle3: collect d2t original dtype before parent converts tensors to F32
eagle3_original_dtypes = {}
if getattr(self, 'is_eagle3', False):
for name, data_torch in self.get_tensors():
if name == "d2t":
eagle3_original_dtypes[name] = data_torch.dtype

super().prepare_tensors()

# eagle3: write d2t as absolute target token ids
if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
for name, data_torch in self._eagle3_int_tensors.items():
old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
data = data_torch.to(torch.int64).cpu().numpy()
if name == "d2t":
data = data.reshape(-1)
data = data + np.arange(data.size, dtype=np.int64)
if np.any((data < 0) | (data >= self.target_vocab_size)):
raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}")
if np.unique(data).size != data.size:
raise ValueError("EAGLE-3 d2t contains duplicate target ids")
data_qtype = gguf.GGMLQuantizationType.I64

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Future-proofing is nice and all, but n_tokens and token ids are limited to int32_t, what is the original dtype?

@ruixiang63 ruixiang63 Jun 11, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The original d2t dtype is torch.int64 in the eagle3 checkpoint. That's why I used I64 to preserve that and avoid any accidental truncation after converting it to absolute target ids.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)

if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
Expand Down
10 changes: 10 additions & 0 deletions convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,15 @@ def parse_args() -> argparse.Namespace:
help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
)

parser.add_argument(
"--target-model-dir", type=str, default=None,
help=(
"path to the target model directory; required when converting a standalone draft model "
"(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and "
"layer count to populate its GGUF."
),
)

args = parser.parse_args()
if not args.print_supported_models and args.model is None:
parser.error("the following arguments are required: model")
Expand Down Expand Up @@ -269,6 +278,7 @@ def main() -> None:
small_first_shard=args.no_tensor_first_split,
remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None,
fuse_gate_up_exps=args.fuse_gate_up_exps,
fp8_as_q8=args.fp8_as_q8,
)
Expand Down
44 changes: 36 additions & 8 deletions gguf-py/gguf/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ class LLM:
HIDDEN_ACT = "{arch}.hidden_activation"
DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in"
DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out"
TARGET_LAYERS = "{arch}.target_layers"
TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size"
NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual"

class Attention:
HEAD_COUNT = "{arch}.attention.head_count"
Expand Down Expand Up @@ -510,6 +513,7 @@ class MODEL_ARCH(IntEnum):
RND1 = auto()
PANGU_EMBED = auto()
MISTRAL3 = auto()
EAGLE3 = auto()
MISTRAL4 = auto()
PADDLEOCR = auto()
MIMO2 = auto()
Expand Down Expand Up @@ -900,14 +904,17 @@ class MODEL_TENSOR(IntEnum):
A_PER_DIM_K_SCALE = auto() # gemma4
A_PER_DIM_SCALE = auto() # gemma4
# nextn/mtp
NEXTN_PROJ_PRE = auto()
NEXTN_PROJ_POST = auto()
NEXTN_EH_PROJ = auto()
NEXTN_EMBED_TOKENS = auto()
NEXTN_ENORM = auto()
NEXTN_HNORM = auto()
NEXTN_PROJ_PRE = auto()
NEXTN_PROJ_POST = auto()
NEXTN_EH_PROJ = auto()
NEXTN_EMBED_TOKENS = auto()
NEXTN_ENORM = auto()
NEXTN_HNORM = auto()
NEXTN_SHARED_HEAD_HEAD = auto()
NEXTN_SHARED_HEAD_NORM = auto()
# eagle3
FC = auto() # feature fusion layer
D2T = auto() # draft to target vocabulary mapping
# lfm2 audio
A_ENC_NORM_CONV = auto()
A_ENC_LINEAR_POS = auto()
Expand Down Expand Up @@ -1062,6 +1069,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.RND1: "rnd1",
MODEL_ARCH.PANGU_EMBED: "pangu-embedded",
MODEL_ARCH.MISTRAL3: "mistral3",
MODEL_ARCH.EAGLE3: "eagle3",
MODEL_ARCH.MISTRAL4: "mistral4",
MODEL_ARCH.PADDLEOCR: "paddleocr",
MODEL_ARCH.MIMO2: "mimo2",
Expand Down Expand Up @@ -1094,8 +1102,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
Expand Down Expand Up @@ -1487,6 +1495,8 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm",
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
MODEL_TENSOR.FC: "fc",
MODEL_TENSOR.D2T: "d2t",
}

MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
Expand Down Expand Up @@ -4027,6 +4037,24 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
],
MODEL_ARCH.EAGLE3: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_NORM_2,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FC,
MODEL_TENSOR.D2T,
],
MODEL_ARCH.MISTRAL4: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
Expand Down
17 changes: 13 additions & 4 deletions src/llama-arch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
#include "llama-impl.h"

#include <map>
#include <set>
#include <vector>

static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
Expand Down Expand Up @@ -128,6 +127,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_RND1, "rnd1" },
{ LLM_ARCH_PANGU_EMBED, "pangu-embedded" },
{ LLM_ARCH_MISTRAL3, "mistral3" },
{ LLM_ARCH_EAGLE3, "eagle3" },
{ LLM_ARCH_MISTRAL4, "mistral4" },
{ LLM_ARCH_PADDLEOCR, "paddleocr" },
{ LLM_ARCH_MIMO2, "mimo2" },
Expand Down Expand Up @@ -292,12 +292,16 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {

{ LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },

{ LLM_KV_TARGET_LAYERS, "%s.target_layers" },
{ LLM_KV_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" },
{ LLM_KV_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" },

{ LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
// sentence-transformers dense modules feature dims
{ LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" },
{ LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
{ LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
{ LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },
{ LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" },
{ LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" },
{ LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" },

{ LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
{ LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
Expand Down Expand Up @@ -561,6 +565,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
{ LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" },
{ LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" },
{ LLM_TENSOR_FC, "fc" },
{ LLM_TENSOR_D2T, "d2t" },
};

// declare information about the model weight tensors:
Expand Down Expand Up @@ -787,6 +793,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
{LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}},
// eagle3
{LLM_TENSOR_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
};

LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
Expand Down
Loading
Loading