From 0806a964c5b6e4d28a023142c0b784907b652fed Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 5 May 2026 20:50:20 +0300 Subject: [PATCH 01/27] llama : enable layer input extraction --- src/llama-context.cpp | 16 +++++++++++++++- src/llama-context.h | 2 ++ src/llama-cparams.h | 3 +++ src/llama-ext.h | 11 +++++++++++ src/llama-graph.cpp | 14 +++++++++++++- src/llama-graph.h | 14 +++++++++----- src/llama-hparams.h | 1 + src/llama-model.cpp | 10 +++++++++- src/models/llama.cpp | 2 ++ src/models/openai-moe.cpp | 2 ++ src/models/qwen3.cpp | 2 ++ src/models/qwen3moe.cpp | 2 ++ 12 files changed, 71 insertions(+), 8 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9a40c4366af1..31f9a530ee7d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -86,6 +86,7 @@ llama_context::llama_context( cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.ctx_other = nullptr; + cparams.output_layer_inp.resize(hparams.n_layer, false); // TODO: more generic if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { @@ -1266,6 +1267,16 @@ bool llama_context::set_adapter_cvec( return res; } +void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) { + LLAMA_LOG_DEBUG("%s: layer_id = %d, enable = %d\n", __func__, layer_id, enable); + + GGML_ASSERT(layer_id < model.hparams.n_layer); + + cparams.output_layer_inp[layer_id] = enable; + + sched_need_reserve = true; +} + llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { if (mctx && !mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); @@ -2041,7 +2052,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { has_embd = true; } - size_t backend_float_count = 0; size_t backend_token_count = 0; @@ -4029,3 +4039,7 @@ llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * c llama_context * llama_get_ctx_other(struct llama_context * 
ctx) { return ctx->get_cparams().ctx_other; } + +void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable) { + ctx->set_output_layer_inp(layer_id, enable); +} diff --git a/src/llama-context.h b/src/llama-context.h index 6f8f59a22a3e..1b516a7bf2b4 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -126,6 +126,8 @@ struct llama_context { int32_t il_start, int32_t il_end); + void set_output_layer_inp(uint32_t layer_id, bool enable); + // process a single ubatch with a specific graph type // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 8a35d389ef40..cb326c8e31ca 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -3,6 +3,7 @@ #include "llama.h" #include +#include #define LLAMA_MAX_SEQ 256 @@ -44,6 +45,8 @@ struct llama_cparams { bool kv_unified; bool pipeline_parallel; + std::vector output_layer_inp; + enum llama_context_type ctx_type; enum llama_pooling_type pooling_type; diff --git a/src/llama-ext.h b/src/llama-ext.h index bd74544129b4..c118f9fb3feb 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -102,3 +102,14 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); + +// +// model/context data extraction +// + +// set whether the layer input embeddings should be output +LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable); + +LLAMA_API ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model); +LLAMA_API void llama_model_set_tok_embd( struct llama_model * model, ggml_tensor * tensor); + diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index da7a9295561c..45f8da1c7940 100644 --- a/src/llama-graph.cpp +++ 
b/src/llama-graph.cpp @@ -895,6 +895,10 @@ void llm_graph_result::reset() { t_logits = nullptr; t_embd = nullptr; t_embd_pooled = nullptr; + + t_layer_inp.resize(LLAMA_MAX_LAYERS); + std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr); + t_sampled.clear(); t_sampled_probs.clear(); t_sampled_logits.clear(); @@ -923,7 +927,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) { } } -void llm_graph_result::set_outputs() { +void llm_graph_result::set_outputs(const llm_graph_params & params) { if (t_logits != nullptr) { ggml_set_output(t_logits); } @@ -936,6 +940,14 @@ void llm_graph_result::set_outputs() { if (t_h_nextn != nullptr) { ggml_set_output(t_h_nextn); } + { + const auto & output_layer_inp = params.cparams.output_layer_inp; + for (size_t il = 0; il < output_layer_inp.size(); ++il) { + if (output_layer_inp[il]) { + ggml_set_output(t_layer_inp[il]); + } + } + } for (auto & [seq_id, t] : t_sampled) { if (t != nullptr) { ggml_set_output(t); diff --git a/src/llama-graph.h b/src/llama-graph.h index 6793846e3ea6..cc5cfe51dcdf 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -705,6 +705,8 @@ class llm_graph_result { ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } ggml_tensor * get_h_nextn() const { return t_h_nextn; } + ggml_tensor * get_layer_inp(int il) const { return t_layer_inp[il]; } + ggml_cgraph * get_gf() const { return gf; } ggml_context * get_ctx() const { return ctx_compute.get(); } @@ -713,7 +715,7 @@ class llm_graph_result { void reset(); void set_inputs(const llama_ubatch * ubatch); - void set_outputs(); + void set_outputs(const llm_graph_params & params); // try to update the existing graph result using the new graph parameters in order to reuse it // this can only be done if we determine that the resulting graph using the new graph parameters @@ -734,10 +736,12 @@ class llm_graph_result { ggml_tensor * t_embd_pooled = nullptr; ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before 
final output norm - std::map t_sampled_logits; - std::map t_candidates; - std::map t_sampled; - std::map t_sampled_probs; + std::vector t_layer_inp; + + std::map t_sampled_logits; + std::map t_candidates; + std::map t_sampled; + std::map t_sampled_probs; std::vector inputs; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 032944cb481c..4f23466ce02b 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -48,6 +48,7 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; + uint32_t n_embd_inp_impl = 0; uint32_t n_layer_all; uint32_t n_layer_nextn = 0; uint32_t n_expert = 0; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4f12e0949acb..a31a23c06149 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2238,7 +2238,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { // TODO: move reranking logic here and generalize llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers); - llm->res->set_outputs(); + llm->res->set_outputs(params); return llm->res->get_gf(); } @@ -2687,3 +2687,11 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid, layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED); } } + +ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model) { + return model->tok_embd; +} + +void llama_model_set_tok_embd(struct llama_model * model, ggml_tensor * tensor) { + model->tok_embd = tensor; +} diff --git a/src/models/llama.cpp b/src/models/llama.cpp index c0ec7e0a9adb..4bfebc8843c6 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -124,6 +124,8 @@ llama_model_llama::graph::graph(const llama_model & model, const llm_grap ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; // norm diff --git a/src/models/openai-moe.cpp 
b/src/models/openai-moe.cpp index 3ab15d61f08c..6d74f9c7e6ef 100644 --- a/src/models/openai-moe.cpp +++ b/src/models/openai-moe.cpp @@ -75,6 +75,8 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_ ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + const float freq_base_l = model.get_rope_freq_base (cparams, il); const float freq_scale_l = model.get_rope_freq_scale(cparams, il); diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index 1d0d2fab362a..f4b2a2aebe0f 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -69,6 +69,8 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; // norm diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index 317e668bec79..6f6df5390e33 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -78,6 +78,8 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; // norm From 800494f85f1397d096cd4e63d9a7f2ced439d0a6 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Mon, 18 May 2026 13:37:43 +0000 Subject: [PATCH 02/27] spec: support eagle3 --- common/speculative.cpp | 436 +++++++++++++++++++++++++++++++- common/speculative.h | 4 + conversion/base.py | 4 + conversion/llama.py | 119 ++++++++- convert_hf_to_gguf.py | 10 + gguf-py/gguf/constants.py | 30 +++ src/llama-arch.cpp | 12 + src/llama-arch.h | 8 + src/llama-context.cpp | 70 ++++- src/llama-context.h | 11 + src/llama-ext.h | 18 ++ src/llama-hparams.h | 7 + src/llama-model-loader.cpp | 1 + src/llama-model.cpp | 24 ++ src/llama-model.h | 10 + src/models/eagle3.cpp | 300 ++++++++++++++++++++++ 
src/models/models.h | 15 ++ tools/server/server-context.cpp | 3 + 18 files changed, 1069 insertions(+), 13 deletions(-) create mode 100644 src/models/eagle3.cpp diff --git a/common/speculative.cpp b/common/speculative.cpp index 86c1e6a42903..79202842023e 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -375,31 +375,425 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { } }; + +// EAGLE3 speculative decoding state +// +// Input of draft decoder: (This is different compared to MTP) +// At "pos P", the decoder takes input pair (t_{P+1}, g_P), with RoPE at P. +// - t_{P+1} = token at sequence pos P+1 (the *next* token after P) +// - g_P = encoder output = projection of target's extracted hidden states at P +// +// Deferred boundary (MTP doesn't have this issue): +// Within a single process() call with n_tokens, we can only write decoder KV for +// training pos 0..n_tokens-2. The last training pos (n_tokens-1) needs t_{n_tokens} +// which lies *outside* this batch — it is the token target will sample next or the first token from next ubatch. +// So the last training pos of each process() call is *deferred* to whichever next call has +// the missing token in hand: +// - multi-ubatch prefill: the next process()'s first token completes the pair +// (handled by the per-seq "cross-ubatch bridge") +// - single-ubatch prefill / after verify: draft()'s seed step uses "dp.id_last" +// (target's freshest sample) to complete the pair +// +// Per-seq carry-over state: +// pending_g_last [n_embd_dec] ┐ the deferred boundary's (g, pos). Set by +// pending_pos_last llama_pos ┘ process() at end of ubatch (= last row); +// rebased by accept() to first-non-accepted pos. +// verify_g [N × n_embd_dec] snapshot of process()'s encoder output; +// verify_pos_first llama_pos consumed by accept() to recover the right +// verify_g_rows int32_t pending_g_last row for any n_accepted value. 
+// +// Performance is overall good but there is waste in verify cycle: +// process() runs encoder + decoder on the *full* verify batch including rows for +// rejected drafts. The KV at those positions is then dropped. +// +// TODO: decide whether this waste is worth optimizing. +// If so, we may need a hybrid stash +// (in verify mode, have process() only stash features and let draft()'s seed step run +// encoder+decoder on n_accepted+1 rows). struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { - //common_params_speculative_eagle3 params; + common_params_speculative_draft params; + llama_batch batch; + + std::vector smpls; + + int32_t n_embd_dec = 0; // draft hidden size + int32_t n_embd_enc = 0; // n_extract_layers * target_hidden_size + int32_t tgt_hidden = 0; // target model hidden size + const int32_t * extract_layers = nullptr; // model_dft's extract layer indices + uint32_t n_extract_layers = 0; + + // [per-seq] deferred boundary state + std::vector> pending_g_last; + std::vector pending_pos_last; + + // [per-seq] snapshot of the most recent process()'s encoder output + std::vector> verify_g; // [n_seq][n_rows * n_embd_dec] + std::vector verify_pos_first; // [n_seq] — pos of verify_g[seq][0] + std::vector verify_g_rows; // [n_seq] — number of rows + + // scratch buffer for concatenated target features [n_tokens, n_embd_enc] + std::vector features_buf; common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) + , params(params.draft) { LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__); LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min); + + auto * ctx_tgt = this->params.ctx_tgt; + auto * ctx_dft = this->params.ctx_dft; + GGML_ASSERT(ctx_tgt && ctx_dft && "EAGLE3 requires ctx_tgt and ctx_dft to be set"); + + const llama_model * model_dft = 
llama_get_model(ctx_dft); + const llama_model * model_tgt = llama_get_model(ctx_tgt); + + extract_layers = llama_model_target_extract_layers (model_dft); + n_extract_layers = llama_model_n_target_extract_layers(model_dft); + if (n_extract_layers != 3) { + throw std::runtime_error("draft model is not eagle3 (expected 3 extract layers, got " + + std::to_string(n_extract_layers) + ")"); + } + + tgt_hidden = (int32_t) llama_model_target_hidden_size(model_dft); + if (tgt_hidden != llama_model_n_embd(model_tgt)) { + throw std::runtime_error("EAGLE3 target_hidden_size mismatch (draft expects " + + std::to_string(tgt_hidden) + ", target n_embd is " + + std::to_string(llama_model_n_embd(model_tgt)) + ")"); + } + + n_embd_dec = llama_model_n_embd(model_dft); + n_embd_enc = (int32_t) n_extract_layers * tgt_hidden; + + const int32_t n_b = (int32_t) llama_n_batch(ctx_dft); + batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1); + // llama_batch_init allocates only one of token/embd; eagle3 decoder needs both. 
+ // TODO: fix, how to call without malloc + batch.token = (llama_token *) malloc(sizeof(llama_token) * n_b); + + smpls.resize(n_seq); + for (auto & s : smpls) { + common_params_sampling sparams; + sparams.no_perf = false; + sparams.top_k = 10; + sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K }; + s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams)); + } + + // turn on extraction of the target layers' input embeddings + for (uint32_t k = 0; k < n_extract_layers; ++k) { + llama_set_output_layer_inp(ctx_tgt, (uint32_t) extract_layers[k], true); + } + + // turn on extraction of the draft model's pre-norm hidden state + // (used both for the encoder output g_embd and the decoder pre-norm output) + llama_set_embeddings_pre_norm(ctx_dft, true); + + pending_g_last.assign(n_seq, std::vector(n_embd_dec, 0.0f)); + pending_pos_last.assign(n_seq, -1); + + verify_g.assign(n_seq, std::vector()); + verify_pos_first.assign(n_seq, -1); + verify_g_rows.assign(n_seq, 0); } - void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { - // noop + ~common_speculative_impl_draft_eagle3() override { + if (batch.token != nullptr) { + free(batch.token); + batch.token = nullptr; + } + llama_batch_free(batch); } - bool process(const llama_batch & /*batch*/) override { - // TODO: implement + void begin(llama_seq_id seq_id, const llama_tokens & prompt) override { + const int32_t N = (int32_t) prompt.size(); + if (N <= 0) { + return; + } + // expected state after prefill: ctx_dft has pos 0..N-2 (last position is deferred to + // draft()'s seed step). Warn only if more than one position is missing. + auto * ctx_dft = this->params.ctx_dft; + const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); + if (pos_max < N - 2) { + LOG_WRN("%s: ctx_dft pos_max=%d < N-2=%d — process() did not run on every prefill ubatch. 
" + "Drafts may degrade.\n", + __func__, (int) pos_max, N - 2); + } + } + + bool process(const llama_batch & batch_in) override { + if (batch_in.n_tokens <= 0) { + return true; + } + + if (batch_in.token == nullptr || batch_in.embd != nullptr) { + return true; + } + + const int32_t n_tokens = batch_in.n_tokens; + + // i_batch_beg[seq] / i_batch_end[seq]: inclusive batch indices of this seq's + // first/last token in batch_in. Assumes per-seq tokens are contiguous within + // the ubatch (server's default ordering). + std::vector i_batch_beg(n_seq, -1); + std::vector i_batch_end(n_seq, -1); + for (int k = 0; k < n_tokens; ++k) { + GGML_ASSERT(batch_in.n_seq_id[k] == 1); + const llama_seq_id seq_id = batch_in.seq_id[k][0]; + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { + continue; + } + i_batch_end[seq_id] = k; + if (i_batch_beg[seq_id] < 0) { + i_batch_beg[seq_id] = k; + } + } + + auto * ctx_tgt = this->params.ctx_tgt; + auto * ctx_dft = this->params.ctx_dft; + + // Interleave each extract_layer's hidden state into a contiguous buffer of + // shape [n_tokens, n_extract_layers * tgt_hidden]. Then run EAGLE3 encoder + // to get one g_embd row per token. 
+ features_buf.assign((size_t) n_tokens * n_embd_enc, 0.0f); + + for (uint32_t k = 0; k < n_extract_layers; ++k) { + const float * layer = llama_get_output_layer_inp(ctx_tgt, (uint32_t) extract_layers[k]); + if (!layer) { + GGML_ABORT("EAGLE3: target layer %d input not extracted.", + extract_layers[k]); + } + for (int32_t i = 0; i < n_tokens; ++i) { + float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) tgt_hidden; + const float * src = layer + (size_t) i * tgt_hidden; + std::memcpy(dst, src, (size_t) tgt_hidden * sizeof(float)); + } + } + + llama_batch enc_batch = { + /*.n_tokens =*/ n_tokens, + /*.token =*/ nullptr, + /*.embd =*/ features_buf.data(), + /*.pos =*/ nullptr, + /*.n_seq_id =*/ nullptr, + /*.seq_id =*/ nullptr, + /*.logits =*/ nullptr, + }; + int rc = llama_encode(ctx_dft, enc_batch); + if (rc != 0) { + LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d)\n", + __func__, rc, (int) n_tokens); + return false; + } + + // g_embd has shape [n_tokens, n_embd_dec] in ctx_dft's pre-norm embeddings buffer + const float * g_embd = llama_get_embeddings_pre_norm(ctx_dft); + GGML_ASSERT(g_embd && "EAGLE3 encoder produced no output."); + + const size_t row_bytes = (size_t) n_embd_dec * sizeof(float); + + // EAGLE3 decoder input convention: at memory pos P the input pair is + // (token[P+1], g_embd[P]). This shifts the token index "left by one" relative to g_embd. + // + // Per seq, in order: + // (a) cross-ubatch bridge — when applicable, write the previously-deferred + // pos using this ubatch's first token + pending_g_last. + // (b) main write loop — for k in [beg, end-1], write (token[k+1], g_embd[k]) + // at pos[k]. The last training pos (k=end) is left unwritten = new + // deferred boundary, completed by the next process() or draft() call. + // (c) refresh deferred state — stash this ubatch's full g_embd into verify_g, + // update pending_g_last / pending_pos_last to the last row. 
+ common_batch_clear(batch); + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + const int32_t beg = i_batch_beg[seq_id]; + const int32_t end = i_batch_end[seq_id]; + if (beg < 0 || end < 0) { + continue; + } + + // cross-ubatch bridge — complete the prior ubatch's deferred boundary. + // Fires iff all three preconditions hold: + // 1) pending_pos_last >= 0 + // 2) pending_pos_last + 1 == pos[beg] + // 3) pending_pos_last > dft_pos_max + const llama_pos pending_pos = pending_pos_last[seq_id]; + if (pending_pos >= 0 && pending_pos + 1 == batch_in.pos[beg]) { + const llama_pos dft_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); + if (pending_pos > dft_pos_max) { + common_batch_add(batch, batch_in.token[beg], pending_pos, { seq_id }, /*logits=*/ false); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, + pending_g_last[seq_id].data(), row_bytes); + } + } + + for (int32_t k = beg; k < end; ++k) { + common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k], + { seq_id }, /*logits=*/ false); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, + g_embd + (size_t) k * n_embd_dec, row_bytes); + } + + // refresh deferred state + const int32_t n_rows = end - beg + 1; + verify_pos_first[seq_id] = batch_in.pos[beg]; + verify_g_rows[seq_id] = n_rows; + verify_g[seq_id].assign((size_t) n_rows * n_embd_dec, 0.0f); + std::memcpy(verify_g[seq_id].data(), + g_embd + (size_t) beg * n_embd_dec, + (size_t) n_rows * row_bytes); + + std::memcpy(pending_g_last[seq_id].data(), + g_embd + (size_t) end * n_embd_dec, row_bytes); + pending_pos_last[seq_id] = batch_in.pos[end]; + } + + if (batch.n_tokens > 0) { + rc = llama_decode(ctx_dft, batch); + if (rc != 0) { + LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n", + __func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]); + return false; + } + } + return true; } - void draft(common_speculative_draft_params_vec & 
/*dparams*/) override { - // TODO: implement + void draft(common_speculative_draft_params_vec & dparams) override { + auto & ctx_dft = params.ctx_dft; + + common_batch_clear(batch); + + // keep track of which sequences are still drafting + int n_drafting = 0; + std::vector drafting(n_seq); + + const size_t row_bytes = (size_t) n_embd_dec * sizeof(float); + + // Complete the deferred boundary pair (dp.id_last, pending_g_last) at memory + // pos pending_pos_last. dp.id_last is target's freshest sample (= corrected + // token after verify, or first generated token after prefill), matching the + // EAGLE3 input convention (token[P+1], g_embd[P]) at pos P. + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + auto & dp = dparams[seq_id]; + + if (!dp.drafting) { + continue; + } + if (pending_pos_last[seq_id] < 0) { + continue; + } + + n_drafting++; + drafting[seq_id] = true; + common_sampler_reset(smpls[seq_id].get()); + + llama_memory_seq_rm(llama_get_memory(ctx_dft), seq_id, pending_pos_last[seq_id], -1); + + common_batch_add(batch, dp.id_last, pending_pos_last[seq_id], { seq_id }, true); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, + pending_g_last[seq_id].data(), + row_bytes); + } + + if (batch.n_tokens == 0) { + return; + } + + int ret = llama_decode(ctx_dft, batch); + if (ret != 0) { + LOG_WRN("%s: llama_decode returned %d\n", __func__, ret); + return; + } + + int i = 0; + + while (n_drafting > 0) { + int i_batch = 0; + + common_batch_clear(batch); + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (!drafting[seq_id]) { + continue; + } + + auto * smpl = smpls[seq_id].get(); + + common_sampler_sample(smpl, ctx_dft, i_batch, true); + // pre-norm hidden state of this position becomes g_embd for the next step + const float * prenorm = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch); + ++i_batch; + + const auto * cur_p = common_sampler_get_candidates(smpl, true); + + for (int k = 0; k < 
std::min(3, (int) cur_p->size); ++k) { + LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", + seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p, + common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); + } + + const llama_token id = cur_p->data[0].id; + + // only collect very high-confidence draft tokens + // (configurable via --spec-draft-p-min, set to 0.0 to disable early-stop) + if (cur_p->data[0].p < params.p_min) { + drafting[seq_id] = false; + n_drafting--; + + continue; + } + + common_sampler_accept(smpl, id, true); + + auto & dp = dparams.at(seq_id); + auto & result = *dp.result; + + result.push_back(id); + + if ((params.n_max <= (int) result.size()) || + (dp.n_max > 0 && dp.n_max <= (int) result.size())) { + drafting[seq_id] = false; + n_drafting--; + continue; + } + + common_batch_add(batch, id, pending_pos_last[seq_id] + (i + 1), { seq_id }, true); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, + prenorm, + row_bytes); + } + + if (batch.n_tokens == 0) { + break; + } + + ret = llama_decode(ctx_dft, batch); + if (ret != 0) { + LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret); + break; + } + + ++i; + } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { - // noop + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override { + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { + return; + } + + const int32_t n_rows = verify_g_rows[seq_id]; + if (n_rows <= 0) { + return; + } + + const int32_t i_g = std::min(n_accepted, n_rows - 1); + pending_pos_last[seq_id] = verify_pos_first[seq_id] + i_g; + std::memcpy(pending_g_last[seq_id].data(), + verify_g[seq_id].data() + (size_t) i_g * n_embd_dec, + (size_t) n_embd_dec * sizeof(float)); } bool need_embd() const override { @@ -1369,9 +1763,11 @@ common_speculative * common_speculative_init(common_params_speculative & params, uint32_t enabled_configs = 
common_get_enabled_speculative_configs(params.types); bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE)); - bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3 + bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && has_draft_model_path; bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr; + + bool has_ngram_cache = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_CACHE)); bool has_ngram_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE)); bool has_ngram_map_k = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K)); @@ -1499,6 +1895,26 @@ void common_speculative_free(common_speculative * spec) { delete spec; } +void common_speculative_setup_draft_model(struct llama_model * model_dft, const struct llama_model * model_tgt) { + if (model_dft == nullptr || model_tgt == nullptr) { + return; + } + if (llama_model_get_tok_embd(model_dft) == nullptr) { + ggml_tensor * tgt_tok_embd = llama_model_get_tok_embd(model_tgt); + if (tgt_tok_embd != nullptr) { + llama_model_set_tok_embd(model_dft, tgt_tok_embd); + LOG_INF("%s: draft inheriting target's tok_embd\n", __func__); + } + } + if (llama_model_get_lm_head(model_dft) == nullptr) { + ggml_tensor * tgt_lm_head = llama_model_get_lm_head(model_tgt); + if (tgt_lm_head != nullptr) { + llama_model_set_lm_head(model_dft, tgt_lm_head); + LOG_INF("%s: draft inheriting target's lm_head\n", __func__); + } + } +} + common_speculative_draft_params & common_speculative_get_draft_params( common_speculative * spec, llama_seq_id seq_id) { diff --git a/common/speculative.h b/common/speculative.h index bf76ad709e26..f1cfcb237f4c 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -27,6 +27,10 @@ common_speculative * common_speculative_init(common_params_speculative & params, void common_speculative_free(common_speculative * spec); +// 
Optional setup hook to call once after loading the draft model but before creating its context. +// Inherits any missing weights from the target model (e.g. tok_embd / lm_head from target model for eagle3 / dflash) +void common_speculative_setup_draft_model(struct llama_model * model_dft, const struct llama_model * model_tgt); + struct common_speculative_draft_params { // this flag is used to chain the drafts through all the available implementations // after the first successful draft from an implementation, we set it diff --git a/conversion/base.py b/conversion/base.py index 408e209aa884..9d81c19b46de 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -94,6 +94,7 @@ class ModelBase: metadata: gguf.Metadata dir_model_card: Path remote_hf_model_id: str | None + target_model_dir: Path | None # subclasses should define this! model_arch: gguf.MODEL_ARCH @@ -119,6 +120,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, disable_mistral_community_chat_template: bool = False, sentence_transformers_dense_modules: bool = False, + target_model_dir: Path | None = None, fuse_gate_up_exps: bool = False, fp8_as_q8: bool = False): if type(self) is ModelBase or \ @@ -139,6 +141,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.dry_run = dry_run self.remote_hf_model_id = remote_hf_model_id self.sentence_transformers_dense_modules = sentence_transformers_dense_modules + self.target_model_dir = target_model_dir self.fuse_gate_up_exps = fuse_gate_up_exps self._gate_exp_buffer: dict[int, Tensor] = {} self._up_exp_buffer: dict[int, Tensor] = {} @@ -2481,6 +2484,7 @@ class LazyTorchTensor(gguf.LazyBase): torch.float16: np.float16, torch.float32: np.float32, torch.uint8: np.uint8, + torch.int64: np.int64, } # only used when byteswapping data. 
Only correct size is needed diff --git a/conversion/llama.py b/conversion/llama.py index fd6167bfd91f..db073b9b361a 100644 --- a/conversion/llama.py +++ b/conversion/llama.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: from torch import Tensor -from .base import ModelBase, TextModel, gguf +from .base import ModelBase, TextModel, gguf, logger @ModelBase.register( @@ -21,6 +21,9 @@ "VLlama3ForCausalLM", "LlavaForConditionalGeneration", "VoxtralForConditionalGeneration", + "LlamaForCausalLMEagle3", + "Eagle3Speculator", + "Eagle3DraftModel", "IQuestCoderForCausalLM", "LlamaModel") class LlamaModel(TextModel): @@ -39,7 +42,57 @@ def __init__(self, *args, **kwargs): hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) self.origin_hf_arch = hparams.get('architectures', [None])[0] + # Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name) + if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: + self.is_eagle3 = True + self.model_arch = gguf.MODEL_ARCH.EAGLE3 + logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture") + # Re-initialize tensor_map with eagle3 architecture + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + # Update gguf_writer architecture + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + if self.target_model_dir is None: + raise ValueError( + "EAGLE-3 model requires --target-model-dir to be specified. 
" + "Please provide the path to the target model directory to read config.json" + ) + # Read both eagle3 raw config and target model config + with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f: + eagle3_raw_config = json.load(f) + with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f: + target_config = json.load(f) + + # extract_layers: derived from target model layer count (low/mid/high) + target_num_layers = target_config["num_hidden_layers"] + extract_layers = [2, target_num_layers // 2, target_num_layers - 3] + logger.info(f"EAGLE-3: extract_layers = {extract_layers} (target model has {target_num_layers} layers)") + self.gguf_writer.add_array(f"{self.gguf_writer.arch}.extract_layers", extract_layers) + + # target_hidden_size: prefer eagle3 config, fallback to target config + if eagle3_raw_config.get("target_hidden_size") is not None: + target_hidden_size = eagle3_raw_config["target_hidden_size"] + src = "EAGLE-3 config" + else: + target_hidden_size = target_config["hidden_size"] + src = "target model config" + logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})") + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size) + + # norm_before_residual (RedHat-style eagle3 specific) + norm_before_residual = eagle3_raw_config.get("norm_before_residual", False) + logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}") + self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual) + def set_vocab(self): + # eagle3: use tokenizer from target model if provided + original_dir_model = None + if getattr(self, 'is_eagle3', False): + assert self.target_model_dir is not None + logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}") + original_dir_model = self.dir_model + self.dir_model = self.target_model_dir + if self.origin_hf_arch == "GlmasrModel": return self._set_vocab_glmedge() @@ -85,6 
+138,10 @@ def set_vocab(self): if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) + # eagle3: Restore original dir_model + if original_dir_model is not None: + self.dir_model = original_dir_model + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -129,7 +186,49 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca return super().filter_tensors((name, gen)) + def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: + tensors = super().index_tensors(remote_hf_model_id) + + # Handle Eagle3Speculator nested config + if "transformer_layer_config" in self.hparams: + self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]} + + # eagle3 detection + if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: + logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*") + new_tensors = {} + for name, gen in tensors.items(): + if name.startswith("midlayer."): + new_name = "model.layers.0." + name[len("midlayer."):] + new_tensors[new_name] = gen + elif name.startswith("layers.0."): # Eagle3Speculator format + new_name = "model." 
+ name + new_tensors[new_name] = gen + else: + new_tensors[name] = gen + return new_tensors + + return tensors + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # eagle3: special tensors that bypass standard llama mapping + if getattr(self, 'is_eagle3', False): + if name == "fc.weight": + yield (name, data_torch) + return + if name == "d2t": + # store for manual int64 handling in prepare_tensors (avoid F32 conversion) + if not hasattr(self, '_eagle3_int_tensors'): + self._eagle3_int_tensors = {} + self._eagle3_int_tensors[name] = data_torch + return + if name == "t2d": + # not used at runtime, skip + return + if name == "model.layers.0.hidden_norm.weight": + yield ("blk.0.hidden_norm.weight", data_torch) + return + n_head = self.find_hparam(["n_heads", "num_attention_heads"]) n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) @@ -205,8 +304,26 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) def prepare_tensors(self): + # eagle3: collect d2t original dtype before parent converts tensors to F32 + eagle3_original_dtypes = {} + if getattr(self, 'is_eagle3', False): + for name, data_torch in self.get_tensors(): + if name == "d2t": + eagle3_original_dtypes[name] = data_torch.dtype + super().prepare_tensors() + # eagle3: write d2t as int64 directly (not converted to F32) + if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'): + for name, data_torch in self._eagle3_int_tensors.items(): + old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype) + data = data_torch.to(torch.int64).numpy() + data_qtype = gguf.GGMLQuantizationType.I64 + + shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}" + logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + self.gguf_writer.add_tensor(name, data, 
raw_dtype=data_qtype) + if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a6192c039a0a..3b23d5ebc0d3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -153,6 +153,15 @@ def parse_args() -> argparse.Namespace: help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.", ) + parser.add_argument( + "--target-model-dir", type=str, default=None, + help=( + "path to the target model directory; required when converting a standalone draft model " + "(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and " + "layer count to populate its GGUF." + ), + ) + args = parser.parse_args() if not args.print_supported_models and args.model is None: parser.error("the following arguments are required: model") @@ -269,6 +278,7 @@ def main() -> None: small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, + target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None, fuse_gate_up_exps=args.fuse_gate_up_exps, fp8_as_q8=args.fp8_as_q8, ) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index bd6246137b0a..1ad57f24d3c0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -154,6 +154,9 @@ class LLM: HIDDEN_ACT = "{arch}.hidden_activation" DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" + EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers" + EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size" + EAGLE3_NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -510,6 +513,7 @@ class MODEL_ARCH(IntEnum): RND1 = auto() PANGU_EMBED = auto() 
MISTRAL3 = auto() + EAGLE3 = auto() MISTRAL4 = auto() PADDLEOCR = auto() MIMO2 = auto() @@ -906,6 +910,10 @@ class MODEL_TENSOR(IntEnum): NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() + # eagle3 + EAGLE3_FC = auto() # feature fusion layer + EAGLE3_HIDDEN_NORM = auto() # hidden normalization + EAGLE3_D2T = auto() # draft to target vocabulary mapping # lfm2 audio A_ENC_NORM_CONV = auto() A_ENC_LINEAR_POS = auto() @@ -1060,6 +1068,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.RND1: "rnd1", MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", + MODEL_ARCH.EAGLE3: "eagle3", MODEL_ARCH.MISTRAL4: "mistral4", MODEL_ARCH.PADDLEOCR: "paddleocr", MODEL_ARCH.MIMO2: "mimo2", @@ -1483,6 +1492,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm", MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", + MODEL_TENSOR.EAGLE3_FC: "fc", + MODEL_TENSOR.EAGLE3_HIDDEN_NORM: "blk.{bid}.hidden_norm", + MODEL_TENSOR.EAGLE3_D2T: "d2t", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -4021,6 +4033,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.EAGLE3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.EAGLE3_FC, + MODEL_TENSOR.EAGLE3_HIDDEN_NORM, + MODEL_TENSOR.EAGLE3_D2T, + ], MODEL_ARCH.MISTRAL4: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 6a5d5f8d2ac8..46217c5eb753 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -128,6 +128,7 @@ static const std::map LLM_ARCH_NAMES = { { 
LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_EAGLE3, "eagle3" }, { LLM_ARCH_MISTRAL4, "mistral4" }, { LLM_ARCH_PADDLEOCR, "paddleocr" }, { LLM_ARCH_MIMO2, "mimo2" }, @@ -292,6 +293,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, + { LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" }, + { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" }, + { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, // sentence-transformers dense modules feature dims { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" }, @@ -559,6 +564,9 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" }, { LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" }, { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" }, + { LLM_TENSOR_EAGLE3_HIDDEN_NORM, "blk.%d.hidden_norm" }, + { LLM_TENSOR_EAGLE3_FC, "fc" }, + { LLM_TENSOR_EAGLE3_D2T, "d2t" }, }; // declare information about the model weight tensors: @@ -783,6 +791,10 @@ static const std::map LLM_TENSOR_INFOS = { // latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU {LLM_TENSOR_FFN_LATENT_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + // eagle3 + {LLM_TENSOR_EAGLE3_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_EAGLE3_HIDDEN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_EAGLE3_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index 03b1a265d67a..60581af024da 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -142,6 +142,7 @@ enum llm_arch { LLM_ARCH_TALKIE, LLM_ARCH_MELLUM, LLM_ARCH_UNKNOWN, + 
LLM_ARCH_EAGLE3, }; enum llm_kv { @@ -336,6 +337,10 @@ enum llm_kv { LLM_KV_CLASSIFIER_OUTPUT_LABELS, + LLM_KV_EAGLE3_EXTRACT_LAYERS, + LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, + LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, + LLM_KV_SHORTCONV_L_CACHE, LLM_KV_XIELU_ALPHA_N, @@ -566,6 +571,9 @@ enum llm_tensor { LLM_TENSOR_NEXTN_HNORM, LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, + LLM_TENSOR_EAGLE3_FC, + LLM_TENSOR_EAGLE3_HIDDEN_NORM, + LLM_TENSOR_EAGLE3_D2T, }; enum llm_tensor_layer { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 31f9a530ee7d..4c40bdf3703d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -87,6 +87,7 @@ llama_context::llama_context( cparams.ctx_other = nullptr; cparams.output_layer_inp.resize(hparams.n_layer, false); + embd_layer_inp.resize(hparams.n_layer); // TODO: more generic if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { @@ -1277,6 +1278,13 @@ void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) { sched_need_reserve = true; } +float * llama_context::get_output_layer_inp(uint32_t layer_id) { + if (layer_id >= embd_layer_inp.size() || embd_layer_inp[layer_id].empty()) { + return nullptr; + } + return embd_layer_inp[layer_id].data(); +} + llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { if (mctx && !mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); @@ -1361,7 +1369,10 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd_inp(); + // eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim + const int64_t n_embd = (hparams.n_embd_target_features > 0 && batch_inp.embd) + ? 
(int64_t) hparams.n_embd_target_features + : hparams.n_embd_inp(); const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -1872,7 +1883,39 @@ int llama_context::decode(const llama_batch & batch_inp) { if (n_outputs) { GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits.size); - ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + + // eagle3: Map draft vocab to target vocab + if (model.arch == LLM_ARCH_EAGLE3 && model.d2t) { + static thread_local std::vector eagle3_d2t_map; + static thread_local std::vector eagle3_draft_logits; + + const int64_t draft_vocab_size = t_logits->ne[0]; + const uint32_t last_idx = n_outputs - 1; + + if (eagle3_d2t_map.empty()) { + eagle3_d2t_map.resize(model.d2t->ne[0]); + ggml_backend_tensor_get(model.d2t, eagle3_d2t_map.data(), 0, + eagle3_d2t_map.size() * sizeof(int64_t)); + } + + eagle3_draft_logits.resize(draft_vocab_size); + const size_t last_offset = last_idx * draft_vocab_size * sizeof(float); + ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(), + last_offset, draft_vocab_size * sizeof(float)); + synchronize(); + + float * last_logits_out = logits_out + last_idx * n_vocab; + std::fill(last_logits_out, last_logits_out + n_vocab, + -std::numeric_limits::infinity()); + + for (int64_t j = 0; j < draft_vocab_size; j++) { + const int64_t target_id = j + eagle3_d2t_map[j]; + GGML_ASSERT(target_id >= 0 && target_id < n_vocab); + last_logits_out[target_id] = eagle3_draft_logits[j]; + } + } else { + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); + } } } @@ -1936,6 +1979,8 @@ int llama_context::decode(const llama_batch & batch_inp) { } } + extract_layer_inputs(res); + // extract nextn embeddings before // only meaningful in LLAMA_POOLING_TYPE_NONE 
(per-token); other pooling modes are ignored. { @@ -2174,6 +2219,23 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { return n_outputs_max; } +void llama_context::extract_layer_inputs(const llm_graph_result * res) { + for (uint32_t il = 0; il < cparams.output_layer_inp.size(); ++il) { + if (!cparams.output_layer_inp[il]) { + continue; + } + ggml_tensor * t = res->get_layer_inp((int) il); + if (!t) { + continue; + } + const size_t nbytes = ggml_nbytes(t); + embd_layer_inp[il].resize(nbytes / sizeof(float)); + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t); + GGML_ASSERT(backend != nullptr); + ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data(), 0, nbytes); + } +} + void llama_context::output_reorder() { const uint64_t n_vocab = model.vocab.n_tokens(); const uint64_t n_embd = model.hparams.n_embd; @@ -4043,3 +4105,7 @@ llama_context * llama_get_ctx_other(struct llama_context * ctx) { void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable) { ctx->set_output_layer_inp(layer_id, enable); } + +float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id) { + return ctx->get_output_layer_inp(layer_id); +} diff --git a/src/llama-context.h b/src/llama-context.h index 1b516a7bf2b4..d6d483cb97d6 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -128,6 +128,9 @@ struct llama_context { void set_output_layer_inp(uint32_t layer_id, bool enable); + // read back the input embeddings of the specified layer + float * get_output_layer_inp(uint32_t layer_id); + // process a single ubatch with a specific graph type // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation @@ -228,6 +231,10 @@ struct llama_context { // map the output row index `i` to batch index int64_t output_resolve_row(int32_t i) const; + // async-copy enabled layer-input tensors (per cparams.output_layer_inp) + // 
from backend into host-side embd_layer_inp buffers + void extract_layer_inputs(const llm_graph_result * res); + // // graph // @@ -356,6 +363,10 @@ struct llama_context { // host buffer for the model output (logits and embeddings) ggml_backend_buffer_ptr buf_output; + // host buffer for output layer input embeddings, per layer + // populated when cparams.output_layer_inp[il] is true + std::vector> embd_layer_inp; + // keep copies of the per-sequence memory on the device std::map mem_storage; diff --git a/src/llama-ext.h b/src/llama-ext.h index c118f9fb3feb..51838a761f10 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -110,6 +110,24 @@ LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); // set if the layer input embeddings should be outputed LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable); +// read back the input embeddings of the specified layer for the most recent ubatch +// the layer must have been enabled via llama_set_output_layer_inp +LLAMA_API float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id); + LLAMA_API ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model); LLAMA_API void llama_model_set_tok_embd( struct llama_model * model, ggml_tensor * tensor); +LLAMA_API ggml_tensor * llama_model_get_lm_head(const struct llama_model * model); +LLAMA_API void llama_model_set_lm_head( struct llama_model * model, ggml_tensor * tensor); + +// +// eagle3/DFlash: consume target model extracted features +// + +// returns pointer to the target-model layer indices +LLAMA_API const int32_t * llama_model_target_extract_layers (const struct llama_model * model); +// returns the number of extracted layers from target model +LLAMA_API uint32_t llama_model_n_target_extract_layers(const struct llama_model * model); +// returns the target model hidden size +LLAMA_API uint32_t llama_model_target_hidden_size (const struct llama_model * model); + diff --git 
a/src/llama-hparams.h b/src/llama-hparams.h index 4f23466ce02b..62d91129504d 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -236,6 +236,13 @@ struct llama_hparams { // >=0 => input embedding index for deepstack injection std::array deepstack_mapping_arr; + // eagle3/DFlash sahred params + // n_embd_target_features = n_extract * target_hidden_size (encoder input dim) + uint32_t n_embd_target_features = 0; + uint32_t target_hidden_size = 0; + // eagle3: whether to apply hidden_norm before storing residual + bool eagle3_norm_before_residual = false; + // gemma4 per-layer embedding uint32_t n_embd_per_layer = 0; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 0d1cf3cc33bb..474cabdfc095 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -394,6 +394,7 @@ namespace GGUFMeta { template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array & result, bool required); + template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); template bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index a31a23c06149..a41740f81ce2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -287,6 +287,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_qwen35moe(params); case LLM_ARCH_MISTRAL3: return new llama_model_mistral3(params); + case LLM_ARCH_EAGLE3: + return new llama_model_eagle3(params); case LLM_ARCH_MIMO2: return new llama_model_mimo2(params); case LLM_ARCH_KIMI_LINEAR: @@ -2406,6 +2408,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_EAGLE3: case LLM_ARCH_MISTRAL4: case LLM_ARCH_LLAMA_EMBED: case 
LLM_ARCH_MAINCODER: @@ -2695,3 +2698,24 @@ ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model) { void llama_model_set_tok_embd(struct llama_model * model, ggml_tensor * tensor) { model->tok_embd = tensor; } + +ggml_tensor * llama_model_get_lm_head(const struct llama_model * model) { + return model->output; +} + +void llama_model_set_lm_head(struct llama_model * model, ggml_tensor * tensor) { + model->output = tensor; +} + +const int32_t * llama_model_target_extract_layers(const struct llama_model * model) { + const auto & v = model->target_extract_layers; + return v.empty() ? nullptr : v.data(); +} + +uint32_t llama_model_n_target_extract_layers(const struct llama_model * model) { + return (uint32_t) model->target_extract_layers.size(); +} + +uint32_t llama_model_target_hidden_size(const struct llama_model * model) { + return model->hparams.target_hidden_size; +} diff --git a/src/llama-model.h b/src/llama-model.h index 992c8d9c8fd9..b28eb7baf256 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -475,6 +475,9 @@ struct llama_layer { struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_eps = nullptr; + // eagle3 + struct ggml_tensor * eagle3_hidden_norm = nullptr; + // Kimi Linear KDA (using ssm_ prefix for consistency) // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias struct ggml_tensor * ssm_q_conv = nullptr; @@ -569,6 +572,13 @@ struct llama_model { struct ggml_tensor * per_layer_model_proj = nullptr; struct ggml_tensor * per_layer_proj_norm = nullptr; + // eagle3 + struct ggml_tensor * fc = nullptr; // feature fusion layer + struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping + + // unified vector to store target-model extracted layer ids in eagle3, dflash, etc. 
+ std::vector target_extract_layers; + std::vector layers; //Dense linear projections for SentenceTransformers models like embeddinggemma diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp new file mode 100644 index 000000000000..3694d262cb85 --- /dev/null +++ b/src/models/eagle3.cpp @@ -0,0 +1,300 @@ +#include "models.h" + +void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + if (!ml.get_arr(LLM_KV_EAGLE3_EXTRACT_LAYERS, target_extract_layers, false)) { + throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata"); + } + if (target_extract_layers.size() != 3) { + throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'"); + } + LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__, + target_extract_layers[0], + target_extract_layers[1], + target_extract_layers[2]); + + ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.target_hidden_size); + LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__, + hparams.target_hidden_size, hparams.n_embd); + + hparams.n_embd_target_features = (uint32_t) target_extract_layers.size() * hparams.target_hidden_size; + + // eagle3 norm_before_residual (optional, default false) + // compatible with Readhat eagle3 speculator model + ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false); + if (hparams.eagle3_norm_before_residual) { + LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__); + } + + type = LLM_TYPE_UNKNOWN; +} + +void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + const int64_t n_embd_target_features = (int64_t) hparams.n_embd_target_features; + const int64_t n_embd_attn_input = 2 * n_embd; + + // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target) + // d2t: draft to 
target vocabulary mapping + int64_t n_draft_vocab = n_vocab; // Default: same as target vocab + const struct ggml_tensor * d2t_meta = ml->get_tensor_meta("d2t"); + if (d2t_meta) { + n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size + d2t = create_tensor(tn(LLM_TENSOR_EAGLE3_D2T), {n_draft_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } else { + d2t = nullptr; // no d2t, use default vocab size + LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } + + // Feature fusion layer: projects 3 target layers to draft hidden size + fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_target_features, n_embd}, 0); + + // Output layer (uses draft vocab size) + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, 0); + + // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own) + const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str()); + if (tok_embd_meta) { + const int64_t n_target_vocab = tok_embd_meta->ne[1]; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab); + } + + // Single decoder layer + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + // input_layernorm: applied to token embeddings + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // Attention takes input_embeds_normed + fused_target_normed as input + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, 
n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + // eagle3 specific: hidden_norm applied to fused target features + layer.eagle3_hidden_norm = create_tensor(tn(LLM_TENSOR_EAGLE3_HIDDEN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // rope_freqs for llama3 rope scaling (optional - only if eagle3 config has rope_scaling) + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED); + } +} + +std::unique_ptr llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const { + switch (params.gtype) { + case LLM_GRAPH_TYPE_ENCODER: + return std::make_unique>(*this, params); + case LLM_GRAPH_TYPE_DEFAULT: + case LLM_GRAPH_TYPE_DECODER: + return std::make_unique>(*this, params); + default: + GGML_ABORT("invalid graph type"); + }; +} + +template <> +ggml_tensor * llama_model_eagle3::graph::build_inp_embd_enc() const { + const int64_t n_embd_target_features = (int64_t) hparams.n_embd_target_features; + + ggml_tensor * cur = nullptr; + + // Input: Target model features (3 layers concatenated: low, mid, high) + // Data will be provided via ubatch->embd in encode_eagle3_features() + auto inp_target = std::make_unique(n_embd_target_features); + inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_target_features, n_tokens); + ggml_set_input(inp_target->embd); + + cur = inp_target->embd; + cb(cur, "inp_embd", -1); + + res->add_input(std::move(inp_target)); + + return cur; +} + +// eagle3 
Encoder: processes target model features through feature fusion layer +// Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high +// Output: g_embeddings e.g. [4096, n_tokens] stored in context +template <> +llama_model_eagle3::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + ggml_tensor * cur = nullptr; + + cur = build_inp_embd_enc(); + + // Feature fusion layer + cur = build_lora_mm(model.fc, cur); + cb(cur, "fc_out", -1); + + // Output: g_embeddings e.g. [4096, n_tokens] + // store in t_h_pre_norm (same as MTP) so can be read via llama_get_embeddings_pre_norm(ctx_dft) + ggml_set_output(cur); + res->t_h_pre_norm = cur; + + ggml_build_forward_expand(gf, cur); +} + +// eagle3 Decoder: processes draft tokens using g_embeddings from encoder +// Input: draft tokens + g_embeddings from encoder +// Output: draft logits +template <> +llama_model_eagle3::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_layer == 1); // eagle3 has only one decoder layer + + ggml_tensor * cur; + ggml_tensor * inpL; + + // eagle3 Decoder receives: + // 1. Token embeddings (e.g.from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B) + // 2. 
g_embeddings from encoder + GGML_ASSERT(model.tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)"); + + auto inp = std::make_unique(n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(inp->embd); + + ggml_tensor * inp_embd = ggml_get_rows(ctx0, model.tok_embd, inp->tokens); + cb(inp_embd, "inp_embd", -1); + + ggml_tensor * inp_g = inp->embd; + cb(inp_g, "inp_g_embeddings", -1); + + res->add_input(std::move(inp)); + + inpL = inp_g; + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + // Single decoder layer (il = 0) + const int il = 0; + { + // Apply input_layernorm to the token embeddings + ggml_tensor * embd_norm = build_norm(inp_embd, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(embd_norm, "embd_norm", il); + + // Apply hidden_norm to inp_g + ggml_tensor * g_norm = build_norm(inp_g, + model.layers[il].eagle3_hidden_norm, NULL, + LLM_NORM_RMS, -1); + cb(g_norm, "g_norm", il); + + // norm_before_residual: determines what goes into the residual connection (compatible with Readhat eagle3 speculator model) + // - false (default): use raw inp_g for residual + // - true: use normalized g_norm for residual + // inpL is the concatenated input (normalized inp_embd + normalized inp_g) + ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? 
g_norm : inpL; + + // Concatenate normalized inp_embd and normalized inp_g + cur = ggml_concat(ctx0, embd_norm, g_norm, il); + cb(cur, "concat_embd", il); + + // Self-attention with concatenated input + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // rope freq factors, returns nullptr if not available + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + // RoPE + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + + if (inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Add residual and update it + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // Apply FFN norm to the sum + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "post_attn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + // Output norm with residual + cur = 
ggml_add(ctx0, cur, ffn_inp); + cb(cur, "eagle3_prenorm", il); + + inpL = cur; + } + + cur = inpL; + + // Output prenorm state (for next token's g_embeddings in autoregressive generation) + ggml_set_output(cur); + res->t_h_pre_norm = cur; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head - projects to draft vocabulary + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/models.h b/src/models/models.h index c137e32e8fd1..bcaee24377f5 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -1089,6 +1089,21 @@ struct llama_model_glm_dsa : public llama_model_base { std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; +struct llama_model_eagle3 : public llama_model_base { + llama_model_eagle3(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + template + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + + ggml_tensor * build_inp_embd_enc() const; + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + struct llama_model_mistral4 : public llama_model_deepseek2 { llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {} diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 6fa302e132f3..0ebb90aba2f5 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -939,6 +939,9 @@ struct server_context_impl { return false; } + // eagle3/DFlash: shares target model's token_embd + common_speculative_setup_draft_model(model_dft.get(), model_tgt); + auto cparams = common_context_params_to_llama(params_dft); const bool spec_mtp 
= std::find(params_base.speculative.types.begin(), From 16e65554fb0887f15b72bb459f4d252081002055 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Mon, 18 May 2026 16:02:29 +0000 Subject: [PATCH 03/27] eagle3: fix params bug --- common/speculative.cpp | 8 ++++---- src/llama-context.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 79202842023e..1d373ccd4fbc 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -404,8 +404,8 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { // // Performance is overall good but there is waste in verify cycle: // process() runs encoder + decoder on the *full* verify batch including rows for -// rejected drafts. The KV at those positions is then dropped. -// +// rejected drafts. The KV at those positions is then dropped. +// // TODO: Not sure if we need optimization for this waste? // If so we may need hybrid stash: // in verify mode, have process() only stash features and let draft() seed run @@ -486,8 +486,8 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } // turn on extraction of the draft model's pre-norm hidden state - // (used both for the encoder output g_embd and the decoder pre-norm output) - llama_set_embeddings_pre_norm(ctx_dft, true); + // (used both for the encoder output g_embd and the decoder pre-norm output). 
+ llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true); pending_g_last.assign(n_seq, std::vector(n_embd_dec, 0.0f)); pending_pos_last.assign(n_seq, -1); diff --git a/src/llama-context.h b/src/llama-context.h index d6d483cb97d6..7d7828319693 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -231,7 +231,7 @@ struct llama_context { // map the output row index `i` to batch index int64_t output_resolve_row(int32_t i) const; - // async-copy enabled layer-input tensors (per cparams.output_layer_inp) + // async-copy enabled layer-input tensors (per cparams.output_layer_inp) // from backend into host-side embd_layer_inp buffers void extract_layer_inputs(const llm_graph_result * res); From 752bf2331f01d05d09632dc06ce7b4fa59ec6380 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Wed, 20 May 2026 16:23:42 +0000 Subject: [PATCH 04/27] eagle3: support Gemma4 eagle3 from RedHatAI --- conversion/__init__.py | 3 +++ conversion/llama.py | 3 +++ src/models/gemma4.cpp | 2 ++ 3 files changed, 8 insertions(+) diff --git a/conversion/__init__.py b/conversion/__init__.py index 18162976f458..cd6f8e6b937c 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -130,6 +130,9 @@ "LlamaBidirectionalModel": "llama", "LlamaForCausalLM": "llama", "LlamaModel": "llama", + "Eagle3DraftModel": "llama", + "Eagle3Speculator": "llama", + "LlamaForCausalLMEagle3": "llama", "LlavaForConditionalGeneration": "llama", "LlavaStableLMEpochForCausalLM": "stablelm", "MPTForCausalLM": "mpt", diff --git a/conversion/llama.py b/conversion/llama.py index db073b9b361a..b08388e456bd 100644 --- a/conversion/llama.py +++ b/conversion/llama.py @@ -63,6 +63,9 @@ def __init__(self, *args, **kwargs): with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f: target_config = json.load(f) + if "text_config" in target_config: + target_config = {**target_config, **target_config["text_config"]} + # extract_layers: derived from target model layer count (low/mid/high) 
target_num_layers = target_config["num_hidden_layers"] extract_layers = [2, target_num_layers // 2, target_num_layers - 3] diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index 6f7fcd645cbd..d0cc40fab2c8 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -210,6 +210,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para const float freq_scale_l = model.get_rope_freq_scale(cparams, il); const int n_rot_l = hparams.n_rot(il); + res->t_layer_inp[il] = inpL; + // norm cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); From b32d9ebe970d496bbfe38af880dbf6ed76919c07 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Wed, 27 May 2026 13:52:08 +0000 Subject: [PATCH 05/27] eagle3: set sync when get features from target Co-authored-by: tnhnyzc <115956684+tnhnyzc@users.noreply.github.com> --- src/llama-context.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4c40bdf3703d..c9956d12bd4a 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -4107,5 +4107,7 @@ void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, b } float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id) { + ctx->synchronize(); + return ctx->get_output_layer_inp(layer_id); } From 7c5f428dc8d902b409c4c34e2fea91d960666a66 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Wed, 27 May 2026 16:38:27 +0000 Subject: [PATCH 06/27] eagle3 : fix ubatch handling in embd_layer_inp extraction and encoder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Doğaç Eldenk --- common/speculative.cpp | 51 +++++++++++++++++++++++++++--------------- src/llama-context.cpp | 45 ++++++++++++++++++++++++++++++------- src/llama-context.h | 4 ++-- src/llama-ext.h | 2 +- 4 files changed, 73 insertions(+), 29 deletions(-) diff --git a/common/speculative.cpp 
b/common/speculative.cpp index 1d373ccd4fbc..0c49f0ee372a 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -433,6 +433,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // scratch buffer for concatenated target features [n_tokens, n_embd_enc] std::vector features_buf; + std::vector g_embd_buf; common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) @@ -570,25 +571,39 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } } - llama_batch enc_batch = { - /*.n_tokens =*/ n_tokens, - /*.token =*/ nullptr, - /*.embd =*/ features_buf.data(), - /*.pos =*/ nullptr, - /*.n_seq_id =*/ nullptr, - /*.seq_id =*/ nullptr, - /*.logits =*/ nullptr, - }; - int rc = llama_encode(ctx_dft, enc_batch); - if (rc != 0) { - LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d)\n", - __func__, rc, (int) n_tokens); - return false; + g_embd_buf.resize((size_t) n_tokens * n_embd_dec); + + // llama_encode() requires the full encoder batch to fit in n_ubatch. + // Allow batch > ubatch: eagle3's per-token encoder can be chunked safely. + const int32_t n_ubatch_dft = (int32_t) llama_n_ubatch(ctx_dft); + for (int32_t i = 0; i < n_tokens; i += n_ubatch_dft) { + const int32_t n_chunk = std::min(n_ubatch_dft, n_tokens - i); + + llama_batch enc_batch = { + /*.n_tokens =*/ n_chunk, + /*.token =*/ nullptr, + /*.embd =*/ features_buf.data() + (size_t) i * n_embd_enc, + /*.pos =*/ nullptr, + /*.n_seq_id =*/ nullptr, + /*.seq_id =*/ nullptr, + /*.logits =*/ nullptr, + }; + const int32_t rc = llama_encode(ctx_dft, enc_batch); + if (rc != 0) { + LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n", + __func__, rc, (int) n_chunk, (int) i); + return false; + } + + // g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer. 
+ const float * g_embd_chunk = llama_get_embeddings_pre_norm(ctx_dft); + GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output."); + std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec, + g_embd_chunk, + (size_t) n_chunk * n_embd_dec * sizeof(float)); } - // g_embd has shape [n_tokens, n_embd_dec] in ctx_dft's pre-norm embeddings buffer - const float * g_embd = llama_get_embeddings_pre_norm(ctx_dft); - GGML_ASSERT(g_embd && "EAGLE3 encoder produced no output."); + const float * g_embd = g_embd_buf.data(); const size_t row_bytes = (size_t) n_embd_dec * sizeof(float); @@ -649,7 +664,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } if (batch.n_tokens > 0) { - rc = llama_decode(ctx_dft, batch); + const int32_t rc = llama_decode(ctx_dft, batch); if (rc != 0) { LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n", __func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c9956d12bd4a..28b3d3880f48 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1279,10 +1279,10 @@ void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) { } float * llama_context::get_output_layer_inp(uint32_t layer_id) { - if (layer_id >= embd_layer_inp.size() || embd_layer_inp[layer_id].empty()) { + if (layer_id >= embd_layer_inp.size() || !embd_layer_inp[layer_id].has_data()) { return nullptr; } - return embd_layer_inp[layer_id].data(); + return embd_layer_inp[layer_id].data; } llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { @@ -1979,7 +1979,7 @@ int llama_context::decode(const llama_batch & batch_inp) { } } - extract_layer_inputs(res); + extract_layer_inputs(res, n_tokens_prev, ubatch.n_tokens); // extract nextn embeddings before // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored. 
@@ -2099,6 +2099,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { size_t backend_float_count = 0; size_t backend_token_count = 0; + size_t embd_layer_inp_float_count = 0; logits.size = has_logits ? n_vocab*n_outputs_max : 0; embd.size = has_embd ? n_embd_out*n_outputs_max : 0; @@ -2110,6 +2111,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd_nextn.size = (size_t) n_embd_out * n_batch; } + for (bool enabled : cparams.output_layer_inp) { + if (enabled) { + embd_layer_inp_float_count += (size_t) n_embd * n_batch; + } + } + // Allocate backend sampling output buffers if there are backend samplers configured. const bool has_sampling = !sampling.samplers.empty(); if (has_sampling) { @@ -2124,8 +2131,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; const size_t new_size = - (logits.size + embd.size + embd_nextn.size + backend_float_count) * sizeof(float) + - ( backend_token_count) * sizeof(llama_token); + (logits.size + embd.size + embd_nextn.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) + + ( backend_token_count) * sizeof(llama_token); // alloc only when more than the current capacity is required // TODO: also consider shrinking the buffer @@ -2142,6 +2149,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { logits.data = nullptr; embd.data = nullptr; embd_nextn.data = nullptr; + for (auto & layer_inp : embd_layer_inp) { + layer_inp = {nullptr, 0}; + } } auto * buft = ggml_backend_cpu_buffer_type(); @@ -2173,6 +2183,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd_nextn = has_embd_nextn ? 
buffer_view{(float *) (base + offset), embd_nextn.size} : buffer_view{nullptr, 0}; offset += embd_nextn.size * sizeof(float); + for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) { + if (cparams.output_layer_inp[il]) { + embd_layer_inp[il] = buffer_view{(float *) (base + offset), (size_t) n_embd * n_batch}; + offset += embd_layer_inp[il].size * sizeof(float); + } else { + embd_layer_inp[il] = buffer_view{nullptr, 0}; + } + } + if (has_sampling) { sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)}; offset += sampling.logits.size * sizeof(float); @@ -2219,20 +2238,30 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { return n_outputs_max; } -void llama_context::extract_layer_inputs(const llm_graph_result * res) { +void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) { for (uint32_t il = 0; il < cparams.output_layer_inp.size(); ++il) { if (!cparams.output_layer_inp[il]) { continue; } + if (!embd_layer_inp[il].has_data()) { + continue; + } ggml_tensor * t = res->get_layer_inp((int) il); if (!t) { continue; } const size_t nbytes = ggml_nbytes(t); - embd_layer_inp[il].resize(nbytes / sizeof(float)); + const size_t nfloats = nbytes / sizeof(float); + GGML_ASSERT(n_tokens > 0); + GGML_ASSERT(nfloats % n_tokens == 0); + + const size_t row_floats = nfloats / n_tokens; + const size_t dst_offset = token_offset * row_floats; + GGML_ASSERT(dst_offset + nfloats <= embd_layer_inp[il].size); + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t); GGML_ASSERT(backend != nullptr); - ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data(), 0, nbytes); + ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data + dst_offset, 0, nbytes); } } diff --git a/src/llama-context.h b/src/llama-context.h index 7d7828319693..af809d280386 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -233,7 +233,7 @@ struct llama_context { // async-copy 
enabled layer-input tensors (per cparams.output_layer_inp) // from backend into host-side embd_layer_inp buffers - void extract_layer_inputs(const llm_graph_result * res); + void extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens); // // graph @@ -365,7 +365,7 @@ struct llama_context { // host buffer for output layer input embeddings, per layer // populated when cparams.output_layer_inp[il] is true - std::vector> embd_layer_inp; + std::vector> embd_layer_inp; // keep copies of the per-sequence memory on the device std::map mem_storage; diff --git a/src/llama-ext.h b/src/llama-ext.h index 51838a761f10..fdde3c89a01e 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -110,7 +110,7 @@ LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); // set if the layer input embeddings should be outputed LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable); -// read back the input embeddings of the specified layer for the most recent ubatch +// read back the input embeddings of the specified layer for the most recent decode batch // the layer must have been enabled via llama_set_output_layer_inp LLAMA_API float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id); From 91b9cfc74220558bff741a59b24adb62d66cb977 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Fri, 5 Jun 2026 12:55:05 +0000 Subject: [PATCH 07/27] eagle3: adapt to upstream changes --- common/speculative.cpp | 8 ++++---- src/llama-context.cpp | 5 +++-- src/llama-model.cpp | 5 +++-- src/models/eagle3.cpp | 6 +++--- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 0c49f0ee372a..0ac0d7ffcaef 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -488,7 +488,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // turn on extraction of the draft model's pre-norm hidden state // 
(used both for the encoder output g_embd and the decoder pre-norm output). - llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true); + llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); pending_g_last.assign(n_seq, std::vector(n_embd_dec, 0.0f)); pending_pos_last.assign(n_seq, -1); @@ -596,7 +596,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } // g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer. - const float * g_embd_chunk = llama_get_embeddings_pre_norm(ctx_dft); + const float * g_embd_chunk = llama_get_embeddings_nextn(ctx_dft); GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output."); std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec, g_embd_chunk, @@ -738,7 +738,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { common_sampler_sample(smpl, ctx_dft, i_batch, true); // pre-norm hidden state of this position becomes g_embd for the next step - const float * prenorm = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch); + const float * prenorm = llama_get_embeddings_nextn_ith(ctx_dft, i_batch); ++i_batch; const auto * cur_p = common_sampler_get_candidates(smpl, true); @@ -1778,7 +1778,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types); bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE)); - bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && has_draft_model_path; + bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr; bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 28b3d3880f48..1b3c073c05d5 100644 --- a/src/llama-context.cpp +++ 
b/src/llama-context.cpp @@ -86,6 +86,7 @@ llama_context::llama_context( cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.ctx_other = nullptr; + cparams.output_layer_inp.resize(hparams.n_layer, false); embd_layer_inp.resize(hparams.n_layer); @@ -196,7 +197,7 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max; + cparams.n_outputs_max = params.n_outputs_max == 0 || llama_model_has_encoder(&model) ? cparams.n_batch : params.n_outputs_max; cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; @@ -1271,7 +1272,7 @@ bool llama_context::set_adapter_cvec( void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) { LLAMA_LOG_DEBUG("%s: layer_id = %d, enable = %d\n", __func__, layer_id, enable); - GGML_ASSERT(layer_id < model.hparams.n_layer); + GGML_ASSERT(layer_id < model.hparams.n_layer_all); cparams.output_layer_inp[layer_id] = enable; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index a41740f81ce2..ebecd57f550a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2603,8 +2603,9 @@ uint64_t llama_model_n_params(const llama_model * model) { bool llama_model_has_encoder(const llama_model * model) { switch (model->arch) { - case LLM_ARCH_T5: return true; - case LLM_ARCH_T5ENCODER: return true; + case LLM_ARCH_T5: + case LLM_ARCH_T5ENCODER: + case LLM_ARCH_EAGLE3: return true; default: return false; } } diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 3694d262cb85..5c5769be53dd 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -136,9 +136,9 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_grap cb(cur, "fc_out", -1); // Output: g_embeddings e.g. 
[4096, n_tokens] - // store in t_h_pre_norm (same as MTP) so can be read via llama_get_embeddings_pre_norm(ctx_dft) + // store in t_h_nextn (same as MTP) so can be read via llama_get_embeddings_nextn(ctx_dft) ggml_set_output(cur); - res->t_h_pre_norm = cur; + res->t_h_nextn = cur; ggml_build_forward_expand(gf, cur); } @@ -283,7 +283,7 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra // Output prenorm state (for next token's g_embeddings in autoregressive generation) ggml_set_output(cur); - res->t_h_pre_norm = cur; + res->t_h_nextn = cur; cur = build_norm(cur, model.output_norm, NULL, From 4ca8087b7b80c6486bc17513ff1434e42e77da64 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Mon, 8 Jun 2026 12:58:53 +0000 Subject: [PATCH 08/27] eagle3: fix rebase issues and adapt to upstream changes --- src/llama-arch.h | 2 +- src/llama-context.cpp | 5 +++-- src/llama-hparams.h | 1 - 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/llama-arch.h b/src/llama-arch.h index 60581af024da..0474d0e6659b 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -141,8 +141,8 @@ enum llm_arch { LLM_ARCH_KIMI_LINEAR, LLM_ARCH_TALKIE, LLM_ARCH_MELLUM, - LLM_ARCH_UNKNOWN, LLM_ARCH_EAGLE3, + LLM_ARCH_UNKNOWN, }; enum llm_kv { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 1b3c073c05d5..f1296b7d4882 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -87,8 +87,8 @@ llama_context::llama_context( cparams.ctx_other = nullptr; - cparams.output_layer_inp.resize(hparams.n_layer, false); - embd_layer_inp.resize(hparams.n_layer); + cparams.output_layer_inp.resize(hparams.n_layer_all, false); + embd_layer_inp.resize(hparams.n_layer_all); // TODO: more generic if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { @@ -2086,6 +2086,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; const auto n_embd_out = 
hparams.n_embd_out(); bool has_logits = true; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 62d91129504d..970a8d689e05 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -48,7 +48,6 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; - uint32_t n_embd_inp_impl = 0; uint32_t n_layer_all; uint32_t n_layer_nextn = 0; uint32_t n_expert = 0; From 413c16da6693a77e40b4b0e5c9308d74956a2909 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Mon, 8 Jun 2026 13:15:21 +0000 Subject: [PATCH 09/27] eagle3:exclude the eagle3 arch from test-llama-archs --- tests/test-llama-archs.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 8037a11398b0..4d06274ef1eb 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -450,6 +450,9 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } + if (arch == LLM_ARCH_EAGLE3) { + continue; + } for (bool moe : {false, true}) { if (moe && !moe_implemented(arch)) { continue; @@ -553,6 +556,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } + if (arch == LLM_ARCH_EAGLE3) { + continue; + } const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1; for (bool moe : {false, true}) { From 6c212225ce132cc57b5b8f53d1cad0de20442f55 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Mon, 8 Jun 2026 13:26:41 +0000 Subject: [PATCH 10/27] eagle3: fix editorconfig check failures --- src/llama-ext.h | 1 - src/models/gemma4.cpp | 2 +- 2 files changed, 1 
insertion(+), 2 deletions(-) diff --git a/src/llama-ext.h b/src/llama-ext.h index fdde3c89a01e..163e1a674284 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -130,4 +130,3 @@ LLAMA_API const int32_t * llama_model_target_extract_layers (const struct llama LLAMA_API uint32_t llama_model_n_target_extract_layers(const struct llama_model * model); // returns the target model hidden size LLAMA_API uint32_t llama_model_target_hidden_size (const struct llama_model * model); - diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index d0cc40fab2c8..6a96979cebde 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -211,7 +211,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para const int n_rot_l = hparams.n_rot(il); res->t_layer_inp[il] = inpL; - + // norm cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); From ac7e2b2f4fc05905e8581c241b9bea07dc1f44a4 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Tue, 9 Jun 2026 17:03:11 +0000 Subject: [PATCH 11/27] eagle3: fix multi-seq issue in d2t vocab mapping --- src/llama-context.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f1296b7d4882..cc69ae8a1594 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1891,7 +1891,6 @@ int llama_context::decode(const llama_batch & batch_inp) { static thread_local std::vector eagle3_draft_logits; const int64_t draft_vocab_size = t_logits->ne[0]; - const uint32_t last_idx = n_outputs - 1; if (eagle3_d2t_map.empty()) { eagle3_d2t_map.resize(model.d2t->ne[0]); @@ -1899,20 +1898,24 @@ int llama_context::decode(const llama_batch & batch_inp) { eagle3_d2t_map.size() * sizeof(int64_t)); } - eagle3_draft_logits.resize(draft_vocab_size); - const size_t last_offset = last_idx * draft_vocab_size * sizeof(float); + // remap every output row (one per sequence) from draft vocab to 
target vocab. + eagle3_draft_logits.resize((size_t) n_outputs * draft_vocab_size); ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(), - last_offset, draft_vocab_size * sizeof(float)); + 0, (size_t) n_outputs * draft_vocab_size * sizeof(float)); synchronize(); - float * last_logits_out = logits_out + last_idx * n_vocab; - std::fill(last_logits_out, last_logits_out + n_vocab, - -std::numeric_limits::infinity()); + for (uint32_t r = 0; r < n_outputs; r++) { + float * row_out = logits_out + (size_t) r * n_vocab; + const float * row_in = eagle3_draft_logits.data() + (size_t) r * draft_vocab_size; - for (int64_t j = 0; j < draft_vocab_size; j++) { - const int64_t target_id = j + eagle3_d2t_map[j]; - GGML_ASSERT(target_id >= 0 && target_id < n_vocab); - last_logits_out[target_id] = eagle3_draft_logits[j]; + std::fill(row_out, row_out + n_vocab, + -std::numeric_limits::infinity()); + + for (int64_t j = 0; j < draft_vocab_size; j++) { + const int64_t target_id = j + eagle3_d2t_map[j]; + GGML_ASSERT(target_id >= 0 && target_id < n_vocab); + row_out[target_id] = row_in[j]; + } } } else { ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); From 544aaa2faa9c2fb8bf1705629d82df97816fe28e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 10 Jun 2026 10:07:31 +0300 Subject: [PATCH 12/27] cont : minor style / clean-up --- common/speculative.cpp | 40 +++++++++++++++++++++------------------- src/llama-arch.h | 2 +- src/llama-context.cpp | 3 +-- src/models/eagle3.cpp | 9 ++++----- src/models/qwen35.cpp | 2 +- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 0ac0d7ffcaef..31935dc688b0 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -556,13 +556,12 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // Interleave each extract_layer's hidden state into a contiguous buffer of // shape 
[n_tokens, n_extract_layers * tgt_hidden]. Then run EAGLE3 encoder // to get one g_embd row per token. - features_buf.assign((size_t) n_tokens * n_embd_enc, 0.0f); + features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f); for (uint32_t k = 0; k < n_extract_layers; ++k) { const float * layer = llama_get_output_layer_inp(ctx_tgt, (uint32_t) extract_layers[k]); if (!layer) { - GGML_ABORT("EAGLE3: target layer %d input not extracted.", - extract_layers[k]); + GGML_ABORT("EAGLE3: target layer %d input not extracted.", extract_layers[k]); } for (int32_t i = 0; i < n_tokens; ++i) { float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) tgt_hidden; @@ -631,7 +630,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // Fires iff all three preconditions hold: // 1) pending_pos_last >= 0 // 2) pending_pos_last + 1 == pos[beg] - // 3) pending_pos_last > dft_pos_max + // 3) pending_pos_last > dft_pos_max // TODO: is this check needed? const llama_pos pending_pos = pending_pos_last[seq_id]; if (pending_pos >= 0 && pending_pos + 1 == batch_in.pos[beg]) { const llama_pos dft_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); @@ -643,8 +642,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } for (int32_t k = beg; k < end; ++k) { - common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k], - { seq_id }, /*logits=*/ false); + common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k], { seq_id }, /*logits=*/ false); std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, g_embd + (size_t) k * n_embd_dec, row_bytes); } @@ -652,15 +650,11 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // refresh deferred state const int32_t n_rows = end - beg + 1; verify_pos_first[seq_id] = batch_in.pos[beg]; - verify_g_rows[seq_id] = n_rows; - verify_g[seq_id].assign((size_t) n_rows * n_embd_dec, 0.0f); - 
std::memcpy(verify_g[seq_id].data(), - g_embd + (size_t) beg * n_embd_dec, - (size_t) n_rows * row_bytes); - - std::memcpy(pending_g_last[seq_id].data(), - g_embd + (size_t) end * n_embd_dec, row_bytes); pending_pos_last[seq_id] = batch_in.pos[end]; + verify_g_rows[seq_id] = n_rows; + verify_g[seq_id].resize((size_t) n_rows * n_embd_dec, 0.0f); + std::memcpy(verify_g[seq_id].data(), g_embd + (size_t) beg * n_embd_dec, row_bytes * n_rows); + std::memcpy(pending_g_last[seq_id].data(), g_embd + (size_t) end * n_embd_dec, row_bytes); } if (batch.n_tokens > 0) { @@ -767,17 +761,14 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { result.push_back(id); - if ((params.n_max <= (int) result.size()) || - (dp.n_max > 0 && dp.n_max <= (int) result.size())) { + if (params.n_max <= (int) result.size()) { drafting[seq_id] = false; n_drafting--; continue; } common_batch_add(batch, id, pending_pos_last[seq_id] + (i + 1), { seq_id }, true); - std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, - prenorm, - row_bytes); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, prenorm, row_bytes); } if (batch.n_tokens == 0) { @@ -792,6 +783,17 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { ++i; } + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + auto & dp = dparams[seq_id]; + if (!dp.drafting) { + continue; + } + + if (dp.result->size() < (size_t) params.n_min) { + dp.result->clear(); + } + } } void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override { diff --git a/src/llama-arch.h b/src/llama-arch.h index 0474d0e6659b..6a4f3ec6b841 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -572,7 +572,7 @@ enum llm_tensor { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, LLM_TENSOR_EAGLE3_FC, - LLM_TENSOR_EAGLE3_HIDDEN_NORM, + LLM_TENSOR_EAGLE3_HIDDEN_NORM, // TODO: remove, use LLM_TENSOR_ATTN_NORM instead 
LLM_TENSOR_EAGLE3_D2T, }; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index cc69ae8a1594..2ff84e5b4cef 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1908,8 +1908,7 @@ int llama_context::decode(const llama_batch & batch_inp) { float * row_out = logits_out + (size_t) r * n_vocab; const float * row_in = eagle3_draft_logits.data() + (size_t) r * draft_vocab_size; - std::fill(row_out, row_out + n_vocab, - -std::numeric_limits::infinity()); + std::fill(row_out, row_out + n_vocab, -std::numeric_limits::infinity()); for (int64_t j = 0; j < draft_vocab_size; j++) { const int64_t target_id = j + eagle3_d2t_map[j]; diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 5c5769be53dd..7d2121f6d8a4 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -249,11 +249,6 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra model.layers[il].wo, NULL, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); - if (inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - // Add residual and update it ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); @@ -285,6 +280,10 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra ggml_set_output(cur); res->t_h_nextn = cur; + if (inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 4b642cff467c..6783d98ec204 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -173,7 +173,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para } if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } From 
b9f41d181dee4ff3599bd599b373c41bc1eb32d3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 10 Jun 2026 10:28:42 +0300 Subject: [PATCH 13/27] spec : remove `common_speculative_setup_draft_model()` --- common/speculative.cpp | 20 -------------------- common/speculative.h | 4 ---- src/llama-context.cpp | 11 ++++++++++- src/models/eagle3.cpp | 11 +++++++++-- tools/server/server-context.cpp | 3 --- 5 files changed, 19 insertions(+), 30 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 31935dc688b0..6af8dc2a9d0b 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1912,26 +1912,6 @@ void common_speculative_free(common_speculative * spec) { delete spec; } -void common_speculative_setup_draft_model(struct llama_model * model_dft, const struct llama_model * model_tgt) { - if (model_dft == nullptr || model_tgt == nullptr) { - return; - } - if (llama_model_get_tok_embd(model_dft) == nullptr) { - ggml_tensor * tgt_tok_embd = llama_model_get_tok_embd(model_tgt); - if (tgt_tok_embd != nullptr) { - llama_model_set_tok_embd(model_dft, tgt_tok_embd); - LOG_INF("%s: draft inheriting target's tok_embd\n", __func__); - } - } - if (llama_model_get_lm_head(model_dft) == nullptr) { - ggml_tensor * tgt_lm_head = llama_model_get_lm_head(model_tgt); - if (tgt_lm_head != nullptr) { - llama_model_set_lm_head(model_dft, tgt_lm_head); - LOG_INF("%s: draft inheriting target's lm_head\n", __func__); - } - } -} - common_speculative_draft_params & common_speculative_get_draft_params( common_speculative * spec, llama_seq_id seq_id) { diff --git a/common/speculative.h b/common/speculative.h index f1cfcb237f4c..bf76ad709e26 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -27,10 +27,6 @@ common_speculative * common_speculative_init(common_params_speculative & params, void common_speculative_free(common_speculative * spec); -// Optional setup hook to call once after loading the draft model but before creating its context. 
-// Inherits any missing weights from the target model (e.g. tok_embd / lm_head from target model for eagle3 / dflash) -void common_speculative_setup_draft_model(struct llama_model * model_dft, const struct llama_model * model_tgt); - struct common_speculative_draft_params { // this flag is used to chain the drafts through all the available implementations // after the first successful draft from an implementation, we set it diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 2ff84e5b4cef..b8e641345ccc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -94,12 +94,21 @@ llama_context::llama_context( if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { if (params.ctx_other == nullptr) { // TODO: change from runtime_error to llama_exception to avoid printing error message - throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)"); + throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this warning is normal during memory fitting)"); } cparams.ctx_other = params.ctx_other; } + if (model.arch == LLM_ARCH_EAGLE3) { + if (model.tok_embd == nullptr) { + if (params.ctx_other == nullptr) { + throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)"); + } + cparams.ctx_other = params.ctx_other; + } + } + // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later // re-reserve when graph nodes change. diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 7d2121f6d8a4..6179e92274c8 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -159,7 +159,14 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra // eagle3 Decoder receives: // 1. Token embeddings (e.g.from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B) // 2. 
g_embeddings from encoder - GGML_ASSERT(model.tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)"); + auto * tok_embd = model.tok_embd; + if (model.tok_embd == nullptr) { + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + + GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)"); + tok_embd = model_other->tok_embd; + } auto inp = std::make_unique(n_embd); @@ -169,7 +176,7 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); ggml_set_input(inp->embd); - ggml_tensor * inp_embd = ggml_get_rows(ctx0, model.tok_embd, inp->tokens); + ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens); cb(inp_embd, "inp_embd", -1); ggml_tensor * inp_g = inp->embd; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 3c395f683ab9..bdfa51718080 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -935,9 +935,6 @@ struct server_context_impl { return false; } - // eagle3/DFlash: shares target model's token_embd - common_speculative_setup_draft_model(model_dft.get(), model_tgt); - auto cparams = common_context_params_to_llama(params_dft); const bool spec_mtp = std::find(params_base.speculative.types.begin(), From f3fbbedfcaf8defa3a09fafe0454b56c4a0573b3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 10 Jun 2026 11:09:48 +0300 Subject: [PATCH 14/27] llama : clean-up unused API --- src/llama-context.cpp | 6 +++--- src/llama-context.h | 8 ++++---- src/llama-ext.h | 6 ------ src/llama-model.cpp | 16 ---------------- 4 files changed, 7 insertions(+), 29 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index b8e641345ccc..ec7a60f65dcf 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,6 +71,9 @@ 
llama_context::llama_context( cparams.no_perf = params.no_perf; cparams.warmup = false; + cparams.output_layer_inp.resize(hparams.n_layer_all, false); + embd_layer_inp.resize(hparams.n_layer_all); + cparams.ctx_type = params.ctx_type; cparams.pooling_type = params.pooling_type; @@ -87,9 +90,6 @@ llama_context::llama_context( cparams.ctx_other = nullptr; - cparams.output_layer_inp.resize(hparams.n_layer_all, false); - embd_layer_inp.resize(hparams.n_layer_all); - // TODO: more generic if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { if (params.ctx_other == nullptr) { diff --git a/src/llama-context.h b/src/llama-context.h index af809d280386..d6e321a5049c 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -297,6 +297,10 @@ struct llama_context { // sets llm_graph_result::t_h_nextn buffer_view embd_nextn = {nullptr, 0}; + // host buffers for output layer input embeddings, per layer + // populated when cparams.output_layer_inp[il] is true + std::vector> embd_layer_inp; + struct sampling_info { // !samplers.empty() to check if any samplers are active std::map samplers; @@ -363,10 +367,6 @@ struct llama_context { // host buffer for the model output (logits and embeddings) ggml_backend_buffer_ptr buf_output; - // host buffer for output layer input embeddings, per layer - // populated when cparams.output_layer_inp[il] is true - std::vector> embd_layer_inp; - // keep copies of the per-sequence memory on the device std::map mem_storage; diff --git a/src/llama-ext.h b/src/llama-ext.h index 163e1a674284..f511f091bb86 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -114,12 +114,6 @@ LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t l // the layer must have been enabled via llama_set_output_layer_inp LLAMA_API float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id); -LLAMA_API ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model); -LLAMA_API void llama_model_set_tok_embd( struct llama_model * 
model, ggml_tensor * tensor); - -LLAMA_API ggml_tensor * llama_model_get_lm_head(const struct llama_model * model); -LLAMA_API void llama_model_set_lm_head( struct llama_model * model, ggml_tensor * tensor); - // // eagle3/DFlash: consume target model extracted features // diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ebecd57f550a..5a2b7aaec451 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2692,22 +2692,6 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid, } } -ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model) { - return model->tok_embd; -} - -void llama_model_set_tok_embd(struct llama_model * model, ggml_tensor * tensor) { - model->tok_embd = tensor; -} - -ggml_tensor * llama_model_get_lm_head(const struct llama_model * model) { - return model->output; -} - -void llama_model_set_lm_head(struct llama_model * model, ggml_tensor * tensor) { - model->output = tensor; -} - const int32_t * llama_model_target_extract_layers(const struct llama_model * model) { const auto & v = model->target_extract_layers; return v.empty() ? 
nullptr : v.data(); From 8002c4ce8066af3ab6750b556131acda85924d15 Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Wed, 10 Jun 2026 21:19:36 +0000 Subject: [PATCH 15/27] eagle3: set d2t vocab mapping in decode graph --- conversion/llama.py | 13 +++++++++++-- src/llama-context.cpp | 36 +----------------------------------- src/models/eagle3.cpp | 15 +++++++++++++++ 3 files changed, 27 insertions(+), 37 deletions(-) diff --git a/conversion/llama.py b/conversion/llama.py index b08388e456bd..dd732716545e 100644 --- a/conversion/llama.py +++ b/conversion/llama.py @@ -5,6 +5,7 @@ from typing import Callable, Iterable, TYPE_CHECKING +import numpy as np import torch if TYPE_CHECKING: @@ -65,6 +66,7 @@ def __init__(self, *args, **kwargs): if "text_config" in target_config: target_config = {**target_config, **target_config["text_config"]} + self.target_vocab_size = target_config["vocab_size"] # extract_layers: derived from target model layer count (low/mid/high) target_num_layers = target_config["num_hidden_layers"] @@ -316,11 +318,18 @@ def prepare_tensors(self): super().prepare_tensors() - # eagle3: write d2t as int64 directly (not converted to F32) + # eagle3: write d2t as absolute target token ids if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'): for name, data_torch in self._eagle3_int_tensors.items(): old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype) - data = data_torch.to(torch.int64).numpy() + data = data_torch.to(torch.int64).cpu().numpy() + if name == "d2t": + data = data.reshape(-1) + data = data + np.arange(data.size, dtype=np.int64) + if np.any((data < 0) | (data >= self.target_vocab_size)): + raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}") + if np.unique(data).size != data.size: + raise ValueError("EAGLE-3 d2t contains duplicate target ids") data_qtype = gguf.GGMLQuantizationType.I64 shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}" diff --git 
a/src/llama-context.cpp b/src/llama-context.cpp index ec7a60f65dcf..21043d34b9c2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1893,41 +1893,7 @@ int llama_context::decode(const llama_batch & batch_inp) { if (n_outputs) { GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits.size); - - // eagle3: Map draft vocab to target vocab - if (model.arch == LLM_ARCH_EAGLE3 && model.d2t) { - static thread_local std::vector eagle3_d2t_map; - static thread_local std::vector eagle3_draft_logits; - - const int64_t draft_vocab_size = t_logits->ne[0]; - - if (eagle3_d2t_map.empty()) { - eagle3_d2t_map.resize(model.d2t->ne[0]); - ggml_backend_tensor_get(model.d2t, eagle3_d2t_map.data(), 0, - eagle3_d2t_map.size() * sizeof(int64_t)); - } - - // remap every output row (one per sequence) from draft vocab to target vocab. - eagle3_draft_logits.resize((size_t) n_outputs * draft_vocab_size); - ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(), - 0, (size_t) n_outputs * draft_vocab_size * sizeof(float)); - synchronize(); - - for (uint32_t r = 0; r < n_outputs; r++) { - float * row_out = logits_out + (size_t) r * n_vocab; - const float * row_in = eagle3_draft_logits.data() + (size_t) r * draft_vocab_size; - - std::fill(row_out, row_out + n_vocab, -std::numeric_limits::infinity()); - - for (int64_t j = 0; j < draft_vocab_size; j++) { - const int64_t target_id = j + eagle3_d2t_map[j]; - GGML_ASSERT(target_id >= 0 && target_id < n_vocab); - row_out[target_id] = row_in[j]; - } - } - } else { - ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); - } + ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float)); } } diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 6179e92274c8..ea0e50606fc3 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -299,6 
+299,21 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra // lm_head - projects to draft vocabulary cur = build_lora_mm(model.output, cur); + if (model.d2t) { + const int64_t n_draft_vocab = cur->ne[0]; + const int64_t n_outputs = cur->ne[1]; + const int64_t n_vocab = (int64_t) model.vocab.n_tokens(); + + GGML_ASSERT(model.d2t->type == GGML_TYPE_I64); + GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab); + + ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY); + cur = ggml_set_rows(ctx0, logits, + ggml_reshape_3d(ctx0, cur, 1, n_draft_vocab, n_outputs), + ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1, 1)); + cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs); + } + cb(cur, "result_output", -1); res->t_logits = cur; From 33b02df4ca680712220b2b4b2c67741c586c607b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 10 Jun 2026 12:26:22 +0300 Subject: [PATCH 16/27] cont : assert layer inputs are configured --- src/llama-context.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 21043d34b9c2..8cea70d65237 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,8 +71,8 @@ llama_context::llama_context( cparams.no_perf = params.no_perf; cparams.warmup = false; - cparams.output_layer_inp.resize(hparams.n_layer_all, false); - embd_layer_inp.resize(hparams.n_layer_all); + cparams.output_layer_inp.resize(hparams.n_layer(), false); + embd_layer_inp.resize(hparams.n_layer()); cparams.ctx_type = params.ctx_type; cparams.pooling_type = params.pooling_type; @@ -1281,7 +1281,7 @@ bool llama_context::set_adapter_cvec( void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) { LLAMA_LOG_DEBUG("%s: layer_id = %d, enable = %d\n", __func__, layer_id, enable); - GGML_ASSERT(layer_id < model.hparams.n_layer_all); + GGML_ASSERT(layer_id < model.hparams.n_layer()); 
cparams.output_layer_inp[layer_id] = enable; @@ -2111,7 +2111,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; const size_t new_size = (logits.size + embd.size + embd_nextn.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) + - ( backend_token_count) * sizeof(llama_token); + ( backend_token_count) * sizeof(llama_token); // alloc only when more than the current capacity is required // TODO: also consider shrinking the buffer @@ -2223,12 +2223,13 @@ void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t to continue; } if (!embd_layer_inp[il].has_data()) { - continue; + GGML_ABORT("output layer input buffer not allocated"); } ggml_tensor * t = res->get_layer_inp((int) il); if (!t) { - continue; + GGML_ABORT("layer input tensor not found"); } + const size_t nbytes = ggml_nbytes(t); const size_t nfloats = nbytes / sizeof(float); GGML_ASSERT(n_tokens > 0); From 785722184028e4a40bcfa296139ef97ef2e71aab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jun 2026 11:21:53 +0300 Subject: [PATCH 17/27] hparams : use n_embd_inp instead of n_embd_target_features --- src/llama-context.cpp | 4 +--- src/llama-hparams.h | 5 ++--- src/models/eagle3.cpp | 12 +++++------- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8cea70d65237..62f05808fe7c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1380,9 +1380,7 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; // eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim - const int64_t n_embd = (hparams.n_embd_target_features > 0 && batch_inp.embd) - ? 
(int64_t) hparams.n_embd_target_features - : hparams.n_embd_inp(); + const int64_t n_embd = hparams.n_embd_inp(); const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 970a8d689e05..6d4c0080db20 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -236,9 +236,8 @@ struct llama_hparams { std::array deepstack_mapping_arr; // eagle3/DFlash sahred params - // n_embd_target_features = n_extract * target_hidden_size (encoder input dim) - uint32_t n_embd_target_features = 0; - uint32_t target_hidden_size = 0; + // n_embd_impl = n_extract * target_hidden_size (encoder input dim) + uint32_t target_hidden_size = 0; // eagle3: whether to apply hidden_norm before storing residual bool eagle3_norm_before_residual = false; diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index ea0e50606fc3..683c7ffad3c7 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -18,7 +18,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__, hparams.target_hidden_size, hparams.n_embd); - hparams.n_embd_target_features = (uint32_t) target_extract_layers.size() * hparams.target_hidden_size; + hparams.n_embd_inp_impl = (uint32_t) target_extract_layers.size() * hparams.target_hidden_size; // eagle3 norm_before_residual (optional, default false) // compatible with Readhat eagle3 speculator model @@ -33,7 +33,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; - const int64_t n_embd_target_features = (int64_t) hparams.n_embd_target_features; + const int64_t n_embd_inp = hparams.n_embd_inp(); const int64_t n_embd_attn_input = 2 * n_embd; // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has 
different vocab_size than target) @@ -50,7 +50,7 @@ void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { } // Feature fusion layer: projects 3 target layers to draft hidden size - fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_target_features, n_embd}, 0); + fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_inp, n_embd}, 0); // Output layer (uses draft vocab size) output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); @@ -104,14 +104,12 @@ std::unique_ptr llama_model_eagle3::build_arch_graph(const ll template <> ggml_tensor * llama_model_eagle3::graph::build_inp_embd_enc() const { - const int64_t n_embd_target_features = (int64_t) hparams.n_embd_target_features; - ggml_tensor * cur = nullptr; // Input: Target model features (3 layers concatenated: low, mid, high) // Data will be provided via ubatch->embd in encode_eagle3_features() - auto inp_target = std::make_unique(n_embd_target_features); - inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_target_features, n_tokens); + auto inp_target = std::make_unique(hparams.n_embd_inp()); + inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens); ggml_set_input(inp_target->embd); cur = inp_target->embd; From 9b2543d00d8d6bf3f42cfd4bea9ad2bfac05a47c Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Thu, 11 Jun 2026 10:35:55 +0000 Subject: [PATCH 18/27] eagle3: make output.weight optional and inherit from target model when needed --- src/llama-context.cpp | 2 +- src/models/eagle3.cpp | 13 +++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 62f05808fe7c..772ba50dc74b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -101,7 +101,7 @@ llama_context::llama_context( } if (model.arch == LLM_ARCH_EAGLE3) { - if (model.tok_embd == nullptr) { + if (model.tok_embd == nullptr || model.output == nullptr) { if (params.ctx_other == 
nullptr) { throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)"); } diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 683c7ffad3c7..471971ab3b13 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -54,7 +54,7 @@ void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { // Output layer (uses draft vocab size) output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED); // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own) const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str()); @@ -295,7 +295,16 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra cb(cur, "result_norm", -1); // lm_head - projects to draft vocabulary - cur = build_lora_mm(model.output, cur); + // if the draft has no own output projection, inherit the target model's lm_head + auto * output = model.output; + if (output == nullptr) { + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + + GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)"); + output = model_other->output; + } + cur = build_lora_mm(output, cur); if (model.d2t) { const int64_t n_draft_vocab = cur->ne[0]; From 1d55316eb18d781131ae84c463f93b252375e002 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jun 2026 13:16:32 +0300 Subject: [PATCH 19/27] hparams : generic norm-before-residual param --- src/llama-hparams.h | 5 ++--- src/models/eagle3.cpp | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 6d4c0080db20..eac464e6b645 100644 ---
a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -45,6 +45,7 @@ struct llama_hparams { bool rope_finetuned; bool use_par_res; bool swin_norm; + bool norm_before_residual = false; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; @@ -236,10 +237,8 @@ struct llama_hparams { std::array deepstack_mapping_arr; // eagle3/DFlash sahred params - // n_embd_impl = n_extract * target_hidden_size (encoder input dim) + // n_embd_inp = n_extract * target_hidden_size (encoder input dim) uint32_t target_hidden_size = 0; - // eagle3: whether to apply hidden_norm before storing residual - bool eagle3_norm_before_residual = false; // gemma4 per-layer embedding uint32_t n_embd_per_layer = 0; diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 471971ab3b13..2f9999bc402e 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -22,8 +22,8 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { // eagle3 norm_before_residual (optional, default false) // compatible with Readhat eagle3 speculator model - ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false); - if (hparams.eagle3_norm_before_residual) { + ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false); + if (hparams.norm_before_residual) { LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__); } @@ -212,7 +212,7 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra // - false (default): use raw inp_g for residual // - true: use normalized g_norm for residual // inpL is the concatenated input (normalized inp_embd + normalized inp_g) - ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? g_norm : inpL; + ggml_tensor * inpSA = hparams.norm_before_residual ? 
g_norm : inpL; // Concatenate normalized inp_embd and normalized inp_g cur = ggml_concat(ctx0, embd_norm, g_norm, il); From 2de116b28b57b4c59524241a61c92c0b3a200645 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jun 2026 14:38:10 +0300 Subject: [PATCH 20/27] llama-ext : consistent names --- common/speculative.cpp | 4 +-- src/llama-context.cpp | 63 ++++++++++++++++++++++++------------------ src/llama-context.h | 8 ++---- src/llama-ext.h | 14 +++++----- 4 files changed, 48 insertions(+), 41 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 6af8dc2a9d0b..a3a64f925a9d 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -483,7 +483,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // turn on extraction of the target layers' input embeddings for (uint32_t k = 0; k < n_extract_layers; ++k) { - llama_set_output_layer_inp(ctx_tgt, (uint32_t) extract_layers[k], true); + llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) extract_layers[k], true); } // turn on extraction of the draft model's pre-norm hidden state @@ -559,7 +559,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f); for (uint32_t k = 0; k < n_extract_layers; ++k) { - const float * layer = llama_get_output_layer_inp(ctx_tgt, (uint32_t) extract_layers[k]); + const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) extract_layers[k]); if (!layer) { GGML_ABORT("EAGLE3: target layer %d input not extracted.", extract_layers[k]); } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 772ba50dc74b..5a015cba6755 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -950,6 +950,14 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) { } } +float * llama_context::get_embeddings_layer_inp(uint32_t lid) { + output_reorder(); + + GGML_ASSERT(lid < embd_layer_inp.size() && 
embd_layer_inp[lid].has_data()); + + return embd_layer_inp[lid].data; +} + llama_token llama_context::get_sampled_token_ith(int32_t idx) { output_reorder(); @@ -1137,6 +1145,16 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) { cparams.embeddings_nextn_masked = masked; } +void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) { + LLAMA_LOG_DEBUG("%s: lid = %d, enable = %d\n", __func__, lid, enable); + + GGML_ASSERT(lid < model.hparams.n_layer()); + + cparams.output_layer_inp[lid] = enable; + + sched_need_reserve = true; +} + void llama_context::set_causal_attn(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); @@ -1278,23 +1296,6 @@ bool llama_context::set_adapter_cvec( return res; } -void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) { - LLAMA_LOG_DEBUG("%s: layer_id = %d, enable = %d\n", __func__, layer_id, enable); - - GGML_ASSERT(layer_id < model.hparams.n_layer()); - - cparams.output_layer_inp[layer_id] = enable; - - sched_need_reserve = true; -} - -float * llama_context::get_output_layer_inp(uint32_t layer_id) { - if (layer_id >= embd_layer_inp.size() || !embd_layer_inp[layer_id].has_data()) { - return nullptr; - } - return embd_layer_inp[layer_id].data; -} - llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) { if (mctx && !mctx->apply()) { LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__); @@ -2269,6 +2270,14 @@ void llama_context::output_reorder() { } } + if (embd_layer_inp.size() > 0) { + for (int lid = 0; lid < (int) embd_layer_inp.size(); ++lid) { + for (uint64_t k = 0; k < n_embd; ++k) { + std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]); + } + } + } + if (!sampling.samplers.empty()) { assert(sampling.logits.size > 0); assert(sampling.probs.size > 0); @@ -3683,6 +3692,10 @@ void llama_set_embeddings_nextn(llama_context * 
ctx, bool value, bool masked) { ctx->set_embeddings_nextn(value, masked); } +void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool value) { + ctx->set_embeddings_layer_inp(lid, value); +} + llama_memory_t llama_get_memory(const struct llama_context * ctx) { if (!ctx) { return nullptr; @@ -3703,6 +3716,12 @@ float * llama_get_embeddings_nextn_ith(llama_context * ctx, int32_t i) { return ctx->get_embeddings_nextn_ith(i); } +float * llama_get_embeddings_layer_inp(llama_context * ctx, uint32_t lid) { + ctx->synchronize(); + + return ctx->get_embeddings_layer_inp(lid); +} + bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) { return ctx->set_sampler(seq_id, smpl); } @@ -4108,13 +4127,3 @@ llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * c llama_context * llama_get_ctx_other(struct llama_context * ctx) { return ctx->get_cparams().ctx_other; } - -void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable) { - ctx->set_output_layer_inp(layer_id, enable); -} - -float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id) { - ctx->synchronize(); - - return ctx->get_output_layer_inp(layer_id); -} diff --git a/src/llama-context.h b/src/llama-context.h index d6e321a5049c..853052be2cad 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -88,6 +88,8 @@ struct llama_context { float * get_embeddings_nextn(); float * get_embeddings_nextn_ith(int32_t i); + float * get_embeddings_layer_inp(uint32_t lid); + llama_token * get_sampled_tokens() const; llama_token get_sampled_token_ith(int32_t idx); @@ -112,6 +114,7 @@ struct llama_context { void set_embeddings (bool value); void set_embeddings_nextn(bool value, bool masked); + void set_embeddings_layer_inp(uint32_t lid, bool enable); void set_causal_attn(bool value); void set_warmup(bool value); @@ -126,11 +129,6 @@ struct llama_context { int32_t il_start, int32_t il_end); - void 
set_output_layer_inp(uint32_t layer_id, bool enable); - - // read back the input embeddings of the specified layer - float * get_output_layer_inp(uint32_t layer_id); - // process a single ubatch with a specific graph type // if memory_context is provided, it will be applied first to the context's memory // ret contains the status of the graph computation diff --git a/src/llama-ext.h b/src/llama-ext.h index f511f091bb86..d20bdefc517d 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -101,19 +101,19 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); +// Set whether the context outputs the input embeddings of a specific layer +LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value); + +// mirrors: +// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); +LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid); + LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); // // model/context data extraction // -// set if the layer input embeddings should be outputed -LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable); - -// read back the input embeddings of the specified layer for the most recent decode batch -// the layer must have been enabled via llama_set_output_layer_inp -LLAMA_API float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id); - // // eagle3/DFlash: consume target model extracted features // From f4088797e483e06008be6e9aaf2be3379c7adfdf Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jun 2026 15:04:53 +0300 Subject: [PATCH 21/27] cont : fix --- src/llama-context.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git 
a/src/llama-context.cpp b/src/llama-context.cpp index 5a015cba6755..077eab7753b3 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2272,8 +2272,10 @@ void llama_context::output_reorder() { if (embd_layer_inp.size() > 0) { for (int lid = 0; lid < (int) embd_layer_inp.size(); ++lid) { - for (uint64_t k = 0; k < n_embd; ++k) { - std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]); + if (embd_layer_inp[lid].size > 0) { + for (uint64_t k = 0; k < n_embd; ++k) { + std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]); + } } } } From d37323315c16128dd9c3a6e98a4eb13be4c9f1f9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jun 2026 15:10:52 +0300 Subject: [PATCH 22/27] hparams : remove target_hidden_size --- common/speculative.cpp | 20 +++++++------------- src/llama-ext.h | 2 -- src/llama-hparams.h | 4 ---- src/llama-model.cpp | 4 ---- src/models/eagle3.cpp | 9 +++++---- 5 files changed, 12 insertions(+), 27 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index a3a64f925a9d..06c28aa322dc 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -418,7 +418,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { int32_t n_embd_dec = 0; // draft hidden size int32_t n_embd_enc = 0; // n_extract_layers * target_hidden_size - int32_t tgt_hidden = 0; // target model hidden size + int32_t n_embd_tgt = 0; // target model hidden size const int32_t * extract_layers = nullptr; // model_dft's extract layer indices uint32_t n_extract_layers = 0; @@ -456,15 +456,9 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { std::to_string(n_extract_layers) + ")"); } - tgt_hidden = (int32_t) llama_model_target_hidden_size(model_dft); - if (tgt_hidden != llama_model_n_embd(model_tgt)) { - throw std::runtime_error("EAGLE3 target_hidden_size mismatch (draft expects " + - std::to_string(tgt_hidden) + ", 
target n_embd is " + - std::to_string(llama_model_n_embd(model_tgt)) + ")"); - } - + n_embd_tgt = llama_model_n_embd(model_tgt); n_embd_dec = llama_model_n_embd(model_dft); - n_embd_enc = (int32_t) n_extract_layers * tgt_hidden; + n_embd_enc = (int32_t) n_extract_layers * n_embd_tgt; const int32_t n_b = (int32_t) llama_n_batch(ctx_dft); batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1); @@ -554,7 +548,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; // Interleave each extract_layer's hidden state into a contiguous buffer of - // shape [n_tokens, n_extract_layers * tgt_hidden]. Then run EAGLE3 encoder + // shape [n_tokens, n_extract_layers * n_embd_tgt]. Then run EAGLE3 encoder // to get one g_embd row per token. features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f); @@ -564,9 +558,9 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { GGML_ABORT("EAGLE3: target layer %d input not extracted.", extract_layers[k]); } for (int32_t i = 0; i < n_tokens; ++i) { - float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) tgt_hidden; - const float * src = layer + (size_t) i * tgt_hidden; - std::memcpy(dst, src, (size_t) tgt_hidden * sizeof(float)); + float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt; + const float * src = layer + (size_t) i * n_embd_tgt; + std::memcpy(dst, src, (size_t) n_embd_tgt * sizeof(float)); } } diff --git a/src/llama-ext.h b/src/llama-ext.h index d20bdefc517d..105daa367c96 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -122,5 +122,3 @@ LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); LLAMA_API const int32_t * llama_model_target_extract_layers (const struct llama_model * model); // returns the number of extracted layers from target model LLAMA_API uint32_t llama_model_n_target_extract_layers(const struct llama_model * model); 
-// returns the target model hidden size -LLAMA_API uint32_t llama_model_target_hidden_size (const struct llama_model * model); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index eac464e6b645..d045059a63e9 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -236,10 +236,6 @@ struct llama_hparams { // >=0 => input embedding index for deepstack injection std::array deepstack_mapping_arr; - // eagle3/DFlash sahred params - // n_embd_inp = n_extract * target_hidden_size (encoder input dim) - uint32_t target_hidden_size = 0; - // gemma4 per-layer embedding uint32_t n_embd_per_layer = 0; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5a2b7aaec451..0e81a49768e0 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2700,7 +2700,3 @@ const int32_t * llama_model_target_extract_layers(const struct llama_model * mod uint32_t llama_model_n_target_extract_layers(const struct llama_model * model) { return (uint32_t) model->target_extract_layers.size(); } - -uint32_t llama_model_target_hidden_size(const struct llama_model * model) { - return model->hparams.target_hidden_size; -} diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 2f9999bc402e..f23b1f5ee9b0 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -14,11 +14,12 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { target_extract_layers[1], target_extract_layers[2]); - ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.target_hidden_size); - LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__, - hparams.target_hidden_size, hparams.n_embd); + uint32_t n_embd_tgt = 0; - hparams.n_embd_inp_impl = (uint32_t) target_extract_layers.size() * hparams.target_hidden_size; + ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, n_embd_tgt); + LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd); + + hparams.n_embd_inp_impl = (uint32_t) target_extract_layers.size() * 
n_embd_tgt; // eagle3 norm_before_residual (optional, default false) // compatible with Readhat eagle3 speculator model From 5caedbcd4fbffd057eb0963ff4af1709a070a7c2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jun 2026 15:14:27 +0300 Subject: [PATCH 23/27] cparams : rename output_layer_inp -> embeddings_layer_inp --- src/llama-context.cpp | 12 ++++++------ src/llama-cparams.h | 2 +- src/llama-graph.cpp | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 077eab7753b3..23f5a7ee29cf 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,7 +71,7 @@ llama_context::llama_context( cparams.no_perf = params.no_perf; cparams.warmup = false; - cparams.output_layer_inp.resize(hparams.n_layer(), false); + cparams.embeddings_layer_inp.resize(hparams.n_layer(), false); embd_layer_inp.resize(hparams.n_layer()); cparams.ctx_type = params.ctx_type; @@ -1150,7 +1150,7 @@ void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) { GGML_ASSERT(lid < model.hparams.n_layer()); - cparams.output_layer_inp[lid] = enable; + cparams.embeddings_layer_inp[lid] = enable; sched_need_reserve = true; } @@ -2089,7 +2089,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd_nextn.size = (size_t) n_embd_out * n_batch; } - for (bool enabled : cparams.output_layer_inp) { + for (bool enabled : cparams.embeddings_layer_inp) { if (enabled) { embd_layer_inp_float_count += (size_t) n_embd * n_batch; } @@ -2162,7 +2162,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { offset += embd_nextn.size * sizeof(float); for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) { - if (cparams.output_layer_inp[il]) { + if (cparams.embeddings_layer_inp[il]) { embd_layer_inp[il] = buffer_view{(float *) (base + offset), (size_t) n_embd * n_batch}; offset += embd_layer_inp[il].size * sizeof(float); } else { @@ -2217,8 +2217,8 @@ uint32_t llama_context::output_reserve(int32_t 
n_outputs) { } void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) { - for (uint32_t il = 0; il < cparams.output_layer_inp.size(); ++il) { - if (!cparams.output_layer_inp[il]) { + for (uint32_t il = 0; il < cparams.embeddings_layer_inp.size(); ++il) { + if (!cparams.embeddings_layer_inp[il]) { continue; } if (!embd_layer_inp[il].has_data()) { diff --git a/src/llama-cparams.h b/src/llama-cparams.h index cb326c8e31ca..2b109f909c0b 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -45,7 +45,7 @@ struct llama_cparams { bool kv_unified; bool pipeline_parallel; - std::vector output_layer_inp; + std::vector embeddings_layer_inp; // [n_layer()] extract input embeddings for layer enum llama_context_type ctx_type; enum llama_pooling_type pooling_type; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 83609e5294f9..fe28385d3b7e 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -950,9 +950,9 @@ void llm_graph_result::set_outputs(const llm_graph_params & params) { ggml_set_output(t_h_nextn); } { - const auto & output_layer_inp = params.cparams.output_layer_inp; - for (size_t il = 0; il < output_layer_inp.size(); ++il) { - if (output_layer_inp[il]) { + const auto & embeddings_layer_inp = params.cparams.embeddings_layer_inp; + for (size_t il = 0; il < embeddings_layer_inp.size(); ++il) { + if (embeddings_layer_inp[il]) { ggml_set_output(t_layer_inp[il]); } } From 0274f0fc7d7fbab69c33ccf757cf7f3d036c17aa Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jun 2026 15:26:10 +0300 Subject: [PATCH 24/27] arch : reuse ATTN_NORM_2 instead of adding new hidden norm --- conversion/llama.py | 2 +- gguf-py/gguf/constants.py | 4 +--- src/llama-arch.cpp | 3 --- src/llama-arch.h | 1 - src/llama-model.h | 3 --- src/models/eagle3.cpp | 8 ++++---- 6 files changed, 6 insertions(+), 15 deletions(-) diff --git a/conversion/llama.py b/conversion/llama.py index dd732716545e..0cce96a582e0 100644 --- 
a/conversion/llama.py +++ b/conversion/llama.py @@ -231,7 +231,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # not used at runtime, skip return if name == "model.layers.0.hidden_norm.weight": - yield ("blk.0.hidden_norm.weight", data_torch) + yield ("blk.0.attn_norm_2.weight", data_torch) return n_head = self.find_hparam(["n_heads", "num_attention_heads"]) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b1f4f00fbecb..545b37db733e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -914,7 +914,6 @@ class MODEL_TENSOR(IntEnum): NEXTN_SHARED_HEAD_NORM = auto() # eagle3 EAGLE3_FC = auto() # feature fusion layer - EAGLE3_HIDDEN_NORM = auto() # hidden normalization EAGLE3_D2T = auto() # draft to target vocabulary mapping # lfm2 audio A_ENC_NORM_CONV = auto() @@ -1497,7 +1496,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", MODEL_TENSOR.EAGLE3_FC: "fc", - MODEL_TENSOR.EAGLE3_HIDDEN_NORM: "blk.{bid}.hidden_norm", MODEL_TENSOR.EAGLE3_D2T: "d2t", } @@ -4045,6 +4043,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_NORM_2, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_V, @@ -4054,7 +4053,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, MODEL_TENSOR.EAGLE3_FC, - MODEL_TENSOR.EAGLE3_HIDDEN_NORM, MODEL_TENSOR.EAGLE3_D2T, ], MODEL_ARCH.MISTRAL4: [ diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index a58f599712de..f88c3bf74e11 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -3,7 +3,6 @@ #include "llama-impl.h" #include -#include #include static const std::map LLM_ARCH_NAMES = { @@ -566,7 +565,6 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" }, { LLM_TENSOR_MASKED_EMBD_CENTROIDS, 
"masked_embd_centroids" }, { LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" }, - { LLM_TENSOR_EAGLE3_HIDDEN_NORM, "blk.%d.hidden_norm" }, { LLM_TENSOR_EAGLE3_FC, "fc" }, { LLM_TENSOR_EAGLE3_D2T, "d2t" }, }; @@ -797,7 +795,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}}, // eagle3 {LLM_TENSOR_EAGLE3_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_EAGLE3_HIDDEN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_EAGLE3_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, }; diff --git a/src/llama-arch.h b/src/llama-arch.h index 0303cf6c11f7..dc06d02157ca 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -574,7 +574,6 @@ enum llm_tensor { LLM_TENSOR_MASKED_EMBD_CENTROIDS, LLM_TENSOR_MASKED_EMBD_ORDERING, LLM_TENSOR_EAGLE3_FC, - LLM_TENSOR_EAGLE3_HIDDEN_NORM, // TODO: remove, use LLM_TENSOR_ATTN_NORM instead LLM_TENSOR_EAGLE3_D2T, }; diff --git a/src/llama-model.h b/src/llama-model.h index b28eb7baf256..a350de4c01e6 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -475,9 +475,6 @@ struct llama_layer { struct ggml_tensor * ffn_act_beta = nullptr; struct ggml_tensor * ffn_act_eps = nullptr; - // eagle3 - struct ggml_tensor * eagle3_hidden_norm = nullptr; - // Kimi Linear KDA (using ssm_ prefix for consistency) // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias struct ggml_tensor * ssm_q_conv = nullptr; diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index f23b1f5ee9b0..3a299126b574 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -72,15 +72,15 @@ void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { // input_layernorm: applied to token embeddings layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + // eagle3 specific: hidden_norm applied to fused target features + layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0); + 
// Attention takes input_embeds_normed + fused_target_normed as input layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0); layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0); layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); - // eagle3 specific: hidden_norm applied to fused target features - layer.eagle3_hidden_norm = create_tensor(tn(LLM_TENSOR_EAGLE3_HIDDEN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); @@ -205,7 +205,7 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra // Apply hidden_norm to inp_g ggml_tensor * g_norm = build_norm(inp_g, - model.layers[il].eagle3_hidden_norm, NULL, + model.layers[il].attn_norm_2, NULL, LLM_NORM_RMS, -1); cb(g_norm, "g_norm", il); From 9baa68be99c0969e1181d805d649b8cb132b15a0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jun 2026 15:42:08 +0300 Subject: [PATCH 25/27] llama : clean-up names --- common/speculative.cpp | 33 +++++++++++++++++---------------- conversion/llama.py | 8 ++++---- gguf-py/gguf/constants.py | 34 +++++++++++++++++----------------- src/llama-arch.cpp | 20 ++++++++++---------- src/llama-arch.h | 10 +++++----- src/llama-ext.h | 8 ++------ src/llama-model.cpp | 8 ++++---- src/llama-model.h | 2 +- src/models/eagle3.cpp | 22 +++++++++++----------- 9 files changed, 71 insertions(+), 74 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 06c28aa322dc..87e9047de73a 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -416,11 +416,12 @@ 
struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { std::vector smpls; - int32_t n_embd_dec = 0; // draft hidden size - int32_t n_embd_enc = 0; // n_extract_layers * target_hidden_size - int32_t n_embd_tgt = 0; // target model hidden size - const int32_t * extract_layers = nullptr; // model_dft's extract layer indices - uint32_t n_extract_layers = 0; + int32_t n_embd_dec = 0; // draft hidden size + int32_t n_embd_enc = 0; // target_layer_ids_n * target_hidden_size + int32_t n_embd_tgt = 0; // target model hidden size + + const int32_t * target_layer_ids = nullptr; // model_dft's extract layer indices + uint32_t target_layer_ids_n = 0; // [per-seq] deferred boundary state std::vector> pending_g_last; @@ -449,16 +450,16 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { const llama_model * model_dft = llama_get_model(ctx_dft); const llama_model * model_tgt = llama_get_model(ctx_tgt); - extract_layers = llama_model_target_extract_layers (model_dft); - n_extract_layers = llama_model_n_target_extract_layers(model_dft); - if (n_extract_layers != 3) { + target_layer_ids = llama_model_target_layer_ids (model_dft); + target_layer_ids_n = llama_model_target_layer_ids_n(model_dft); + if (target_layer_ids_n != 3) { throw std::runtime_error("draft model is not eagle3 (expected 3 extract layers, got " + - std::to_string(n_extract_layers) + ")"); + std::to_string(target_layer_ids_n) + ")"); } n_embd_tgt = llama_model_n_embd(model_tgt); n_embd_dec = llama_model_n_embd(model_dft); - n_embd_enc = (int32_t) n_extract_layers * n_embd_tgt; + n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt; const int32_t n_b = (int32_t) llama_n_batch(ctx_dft); batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1); @@ -476,8 +477,8 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } // turn on extraction of the target layers' input embeddings - for (uint32_t k = 0; k < 
n_extract_layers; ++k) { - llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) extract_layers[k], true); + for (uint32_t k = 0; k < target_layer_ids_n; ++k) { + llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true); } // turn on extraction of the draft model's pre-norm hidden state @@ -548,14 +549,14 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; // Interleave each extract_layer's hidden state into a contiguous buffer of - // shape [n_tokens, n_extract_layers * n_embd_tgt]. Then run EAGLE3 encoder + // shape [n_tokens, target_layer_ids_n * n_embd_tgt]. Then run EAGLE3 encoder // to get one g_embd row per token. features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f); - for (uint32_t k = 0; k < n_extract_layers; ++k) { - const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) extract_layers[k]); + for (uint32_t k = 0; k < target_layer_ids_n; ++k) { + const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k]); if (!layer) { - GGML_ABORT("EAGLE3: target layer %d input not extracted.", extract_layers[k]); + GGML_ABORT("EAGLE3: target layer %d input not extracted.", target_layer_ids[k]); } for (int32_t i = 0; i < n_tokens; ++i) { float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt; diff --git a/conversion/llama.py b/conversion/llama.py index 0cce96a582e0..cad802bf8246 100644 --- a/conversion/llama.py +++ b/conversion/llama.py @@ -68,11 +68,11 @@ def __init__(self, *args, **kwargs): target_config = {**target_config, **target_config["text_config"]} self.target_vocab_size = target_config["vocab_size"] - # extract_layers: derived from target model layer count (low/mid/high) + # target_layers: derived from target model layer count (low/mid/high) target_num_layers = target_config["num_hidden_layers"] - extract_layers = [2, target_num_layers // 2, target_num_layers - 3] - logger.info(f"EAGLE-3: 
extract_layers = {extract_layers} (target model has {target_num_layers} layers)") - self.gguf_writer.add_array(f"{self.gguf_writer.arch}.extract_layers", extract_layers) + target_layers = [2, target_num_layers // 2, target_num_layers - 3] + logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)") + self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers) # target_hidden_size: prefer eagle3 config, fallback to target config if eagle3_raw_config.get("target_hidden_size") is not None: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 545b37db733e..bebc57a1b615 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -154,9 +154,9 @@ class LLM: HIDDEN_ACT = "{arch}.hidden_activation" DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" - EAGLE3_EXTRACT_LAYERS = "{arch}.extract_layers" - EAGLE3_TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size" - EAGLE3_NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual" + TARGET_LAYERS = "{arch}.target_layers" + TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size" + NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -904,17 +904,17 @@ class MODEL_TENSOR(IntEnum): A_PER_DIM_K_SCALE = auto() # gemma4 A_PER_DIM_SCALE = auto() # gemma4 # nextn/mtp - NEXTN_PROJ_PRE = auto() - NEXTN_PROJ_POST = auto() - NEXTN_EH_PROJ = auto() - NEXTN_EMBED_TOKENS = auto() - NEXTN_ENORM = auto() - NEXTN_HNORM = auto() + NEXTN_PROJ_PRE = auto() + NEXTN_PROJ_POST = auto() + NEXTN_EH_PROJ = auto() + NEXTN_EMBED_TOKENS = auto() + NEXTN_ENORM = auto() + NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() # eagle3 - EAGLE3_FC = auto() # feature fusion layer - EAGLE3_D2T = auto() # draft to target vocabulary mapping + FC = auto() # feature fusion layer + D2T = auto() # draft to target vocabulary mapping # lfm2 audio 
A_ENC_NORM_CONV = auto() A_ENC_LINEAR_POS = auto() @@ -1102,8 +1102,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense - MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense + MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense + MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense MODEL_TENSOR.ROPE_FREQS: "rope_freqs", MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", @@ -1495,8 +1495,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm", MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", - MODEL_TENSOR.EAGLE3_FC: "fc", - MODEL_TENSOR.EAGLE3_D2T: "d2t", + MODEL_TENSOR.FC: "fc", + MODEL_TENSOR.D2T: "d2t", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -4052,8 +4052,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.EAGLE3_FC, - MODEL_TENSOR.EAGLE3_D2T, + MODEL_TENSOR.FC, + MODEL_TENSOR.D2T, ], MODEL_ARCH.MISTRAL4: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index f88c3bf74e11..6af9b0df9848 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -292,16 +292,16 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, - { LLM_KV_EAGLE3_EXTRACT_LAYERS, "%s.extract_layers" }, - { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, - { LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" }, + { LLM_KV_TARGET_LAYERS, "%s.target_layers" }, + { LLM_KV_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" }, { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, // 
sentence-transformers dense modules feature dims { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" }, - { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" }, - { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" }, - { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" }, + { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" }, + { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" }, + { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, @@ -565,8 +565,8 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" }, { LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" }, { LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" }, - { LLM_TENSOR_EAGLE3_FC, "fc" }, - { LLM_TENSOR_EAGLE3_D2T, "d2t" }, + { LLM_TENSOR_FC, "fc" }, + { LLM_TENSOR_D2T, "d2t" }, }; // declare information about the model weight tensors: @@ -794,8 +794,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}}, {LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}}, // eagle3 - {LLM_TENSOR_EAGLE3_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_EAGLE3_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, + {LLM_TENSOR_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index dc06d02157ca..723d2f8d2371 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -337,9 +337,9 @@ enum llm_kv { LLM_KV_CLASSIFIER_OUTPUT_LABELS, - LLM_KV_EAGLE3_EXTRACT_LAYERS, - LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, - LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, + LLM_KV_TARGET_LAYERS, + LLM_KV_TARGET_HIDDEN_SIZE, + LLM_KV_NORM_BEFORE_RESIDUAL, LLM_KV_SHORTCONV_L_CACHE, @@ -573,8 +573,8 @@ enum llm_tensor { 
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, LLM_TENSOR_MASKED_EMBD_CENTROIDS, LLM_TENSOR_MASKED_EMBD_ORDERING, - LLM_TENSOR_EAGLE3_FC, - LLM_TENSOR_EAGLE3_D2T, + LLM_TENSOR_FC, + LLM_TENSOR_D2T, }; diff --git a/src/llama-ext.h b/src/llama-ext.h index 105daa367c96..b744af52864b 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -114,11 +114,7 @@ LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); // model/context data extraction // -// -// eagle3/DFlash: consume target model extracted features -// - // returns pointer to the target-model layer indices -LLAMA_API const int32_t * llama_model_target_extract_layers (const struct llama_model * model); +LLAMA_API const int32_t * llama_model_target_layer_ids (const struct llama_model * model); // returns the number of extracted layers from target model -LLAMA_API uint32_t llama_model_n_target_extract_layers(const struct llama_model * model); +LLAMA_API uint32_t llama_model_target_layer_ids_n(const struct llama_model * model); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0e81a49768e0..7281ed79f105 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2692,11 +2692,11 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid, } } -const int32_t * llama_model_target_extract_layers(const struct llama_model * model) { - const auto & v = model->target_extract_layers; +const int32_t * llama_model_target_layer_ids(const struct llama_model * model) { + const auto & v = model->target_layer_ids; return v.empty() ? 
nullptr : v.data(); } -uint32_t llama_model_n_target_extract_layers(const struct llama_model * model) { - return (uint32_t) model->target_extract_layers.size(); +uint32_t llama_model_target_layer_ids_n(const struct llama_model * model) { + return (uint32_t) model->target_layer_ids.size(); } diff --git a/src/llama-model.h b/src/llama-model.h index a350de4c01e6..f4718f6d5842 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -574,7 +574,7 @@ struct llama_model { struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping // unified vector to store target-model extracted layer ids in eagle3, dflash, etc. - std::vector target_extract_layers; + std::vector target_layer_ids; std::vector layers; diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index 3a299126b574..a72eca889f6a 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -3,29 +3,29 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - if (!ml.get_arr(LLM_KV_EAGLE3_EXTRACT_LAYERS, target_extract_layers, false)) { + if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) { throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata"); } - if (target_extract_layers.size() != 3) { + if (target_layer_ids.size() != 3) { throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'"); } LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__, - target_extract_layers[0], - target_extract_layers[1], - target_extract_layers[2]); + target_layer_ids[0], + target_layer_ids[1], + target_layer_ids[2]); uint32_t n_embd_tgt = 0; - ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, n_embd_tgt); + ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt); LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd); - hparams.n_embd_inp_impl = (uint32_t) target_extract_layers.size() * n_embd_tgt; + 
hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt; // eagle3 norm_before_residual (optional, default false) // compatible with Readhat eagle3 speculator model - ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false); + ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false); if (hparams.norm_before_residual) { - LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__); + LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__); } type = LLM_TYPE_UNKNOWN; @@ -43,7 +43,7 @@ void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { const struct ggml_tensor * d2t_meta = ml->get_tensor_meta("d2t"); if (d2t_meta) { n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size - d2t = create_tensor(tn(LLM_TENSOR_EAGLE3_D2T), {n_draft_vocab}, 0); + d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0); LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); } else { d2t = nullptr; // no d2t, use default vocab size @@ -51,7 +51,7 @@ void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { } // Feature fusion layer: projects 3 target layers to draft hidden size - fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_inp, n_embd}, 0); + fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0); // Output layer (uses draft vocab size) output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); From 0bd54498f273bf290a8fd55152deedf8e7c878dc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 11 Jun 2026 16:06:27 +0300 Subject: [PATCH 26/27] cont : add assert + comment --- src/llama-context.cpp | 1 + src/llama-graph.cpp | 1 + src/models/eagle3.cpp | 6 ------ 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 23f5a7ee29cf..168dbabd7667 100644 --- a/src/llama-context.cpp +++ 
b/src/llama-context.cpp @@ -1152,6 +1152,7 @@ void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) { cparams.embeddings_layer_inp[lid] = enable; + // note: without this reserve, the draft acceptance drops to zero. not sure why - this is unexpected sched_need_reserve = true; } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index fe28385d3b7e..7468bd9b79ef 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -953,6 +953,7 @@ void llm_graph_result::set_outputs(const llm_graph_params & params) { const auto & embeddings_layer_inp = params.cparams.embeddings_layer_inp; for (size_t il = 0; il < embeddings_layer_inp.size(); ++il) { if (embeddings_layer_inp[il]) { + GGML_ASSERT(t_layer_inp[il] != nullptr && "layer input tensor is null"); ggml_set_output(t_layer_inp[il]); } } diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp index a72eca889f6a..3321b390515d 100644 --- a/src/models/eagle3.cpp +++ b/src/models/eagle3.cpp @@ -192,8 +192,6 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); - ggml_tensor * inp_out_ids = build_inp_out_ids(); - // Single decoder layer (il = 0) const int il = 0; { @@ -286,10 +284,6 @@ llama_model_eagle3::graph::graph(const llama_model & model, const llm_gra ggml_set_output(cur); res->t_h_nextn = cur; - if (inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - } - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); From 7c42aff5e3c7c39841d3656c2af228c55f7b5a3d Mon Sep 17 00:00:00 2001 From: Ruixiang Wang Date: Thu, 11 Jun 2026 23:09:09 +0200 Subject: [PATCH 27/27] Update conversion/llama.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- conversion/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conversion/llama.py b/conversion/llama.py index cad802bf8246..b87bf92d4633 100644 --- 
a/conversion/llama.py +++ b/conversion/llama.py @@ -230,8 +230,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "t2d": # not used at runtime, skip return - if name == "model.layers.0.hidden_norm.weight": - yield ("blk.0.attn_norm_2.weight", data_torch) + if name.endswith(".hidden_norm.weight"): + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch) return n_head = self.find_hparam(["n_heads", "num_attention_heads"])