From 0806a964c5b6e4d28a023142c0b784907b652fed Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 5 May 2026 20:50:20 +0300
Subject: [PATCH 01/27] llama : enable layer input extraction

---
 src/llama-context.cpp     | 16 +++++++++++++++-
 src/llama-context.h       |  2 ++
 src/llama-cparams.h       |  3 +++
 src/llama-ext.h           | 11 +++++++++++
 src/llama-graph.cpp       | 14 +++++++++++++-
 src/llama-graph.h         | 14 +++++++++-----
 src/llama-hparams.h       |  1 +
 src/llama-model.cpp       | 10 +++++++++-
 src/models/llama.cpp      |  2 ++
 src/models/openai-moe.cpp |  2 ++
 src/models/qwen3.cpp      |  2 ++
 src/models/qwen3moe.cpp   |  2 ++
 12 files changed, 71 insertions(+), 8 deletions(-)
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 9a40c4366af1..31f9a530ee7d 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -86,6 +86,7 @@ llama_context::llama_context(
     cparams.cb_eval_user_data = params.cb_eval_user_data;
 
     cparams.ctx_other = nullptr;
+    cparams.output_layer_inp.resize(hparams.n_layer, false);
 
     // TODO: more generic
     if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
@@ -1266,6 +1267,16 @@ bool llama_context::set_adapter_cvec(
     return res;
 }
 
+void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) { // toggle extraction of the input of layer `layer_id`
+    LLAMA_LOG_DEBUG("%s: layer_id = %u, enable = %d\n", __func__, layer_id, enable); // layer_id is unsigned, hence %u
+
+    GGML_ASSERT(layer_id < model.hparams.n_layer); // the flag vector is indexed by layer id
+
+    cparams.output_layer_inp[layer_id] = enable;
+
+    sched_need_reserve = true; // set unconditionally, even if the flag value did not change
+}
+
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
     if (mctx && !mctx->apply()) {
         LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
@@ -2041,7 +2052,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
         has_embd   = true;
     }
 
-
     size_t backend_float_count = 0;
     size_t backend_token_count = 0;
 
@@ -4029,3 +4039,7 @@ llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * c
 llama_context * llama_get_ctx_other(struct llama_context * ctx) {
     return ctx->get_cparams().ctx_other;
 }
+
+void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable) { // C API wrapper; ctx is dereferenced unconditionally
+    ctx->set_output_layer_inp(layer_id, enable);
+}
diff --git a/src/llama-context.h b/src/llama-context.h
index 6f8f59a22a3e..1b516a7bf2b4 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -126,6 +126,8 @@ struct llama_context {
                 int32_t   il_start,
                 int32_t   il_end);
 
+    void set_output_layer_inp(uint32_t layer_id, bool enable);
+
     // process a single ubatch with a specific graph type
     // if memory_context is provided, it will be applied first to the context's memory
     // ret contains the status of the graph computation
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 8a35d389ef40..cb326c8e31ca 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -3,6 +3,7 @@
 #include "llama.h"
 
 #include <cstdint>
+#include <vector>
 
 #define LLAMA_MAX_SEQ 256
 
@@ -44,6 +45,8 @@ struct llama_cparams {
     bool kv_unified;
     bool pipeline_parallel;
 
+    std::vector<bool> output_layer_inp;
+
     enum llama_context_type ctx_type;
     enum llama_pooling_type pooling_type;
 
diff --git a/src/llama-ext.h b/src/llama-ext.h
index bd74544129b4..c118f9fb3feb 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -102,3 +102,14 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
 LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i);
 
 LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
+
+//
+// model/context data extraction
+//
+
+// set whether the input embeddings of the given layer should be extracted as an output
+LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable);
+
+LLAMA_API ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model);
+LLAMA_API void          llama_model_set_tok_embd(      struct llama_model * model, ggml_tensor * tensor);
+
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index da7a9295561c..45f8da1c7940 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -895,6 +895,10 @@ void llm_graph_result::reset() {
     t_logits      = nullptr;
     t_embd        = nullptr;
     t_embd_pooled = nullptr;
+
+    t_layer_inp.resize(LLAMA_MAX_LAYERS);
+    std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr);
+
     t_sampled.clear();
     t_sampled_probs.clear();
     t_sampled_logits.clear();
@@ -923,7 +927,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
     }
 }
 
-void llm_graph_result::set_outputs() {
+void llm_graph_result::set_outputs(const llm_graph_params & params) {
     if (t_logits != nullptr) {
         ggml_set_output(t_logits);
     }
@@ -936,6 +940,14 @@ void llm_graph_result::set_outputs() {
     if (t_h_nextn != nullptr) {
         ggml_set_output(t_h_nextn);
     }
+    { // mark every requested per-layer input tensor as a graph output
+        const auto & output_layer_inp = params.cparams.output_layer_inp;
+        for (size_t il = 0; il < output_layer_inp.size(); ++il) {
+            if (output_layer_inp[il] && t_layer_inp[il] != nullptr) { // stays null when the graph builder did not record this layer's input
+                ggml_set_output(t_layer_inp[il]);
+            }
+        }
+    }
     for (auto & [seq_id, t] : t_sampled) {
         if (t != nullptr) {
             ggml_set_output(t);
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 6793846e3ea6..cc5cfe51dcdf 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -705,6 +705,8 @@ class llm_graph_result {
     ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
     ggml_tensor * get_h_nextn()     const { return t_h_nextn; }
 
+    ggml_tensor * get_layer_inp(int il) const { return (il >= 0 && (size_t) il < t_layer_inp.size()) ? t_layer_inp[il] : nullptr; } // nullptr when il is out of range or the layer input was not recorded
+
     ggml_cgraph  * get_gf()  const { return gf; }
     ggml_context * get_ctx() const { return ctx_compute.get(); }
 
@@ -713,7 +715,7 @@ class llm_graph_result {
     void reset();
 
     void set_inputs(const llama_ubatch * ubatch);
-    void set_outputs();
+    void set_outputs(const llm_graph_params & params);
 
     // try to update the existing graph result using the new graph parameters in order to reuse it
     // this can only be done if we determine that the resulting graph using the new graph parameters
@@ -734,10 +736,12 @@ class llm_graph_result {
     ggml_tensor * t_embd_pooled = nullptr;
     ggml_tensor * t_h_nextn     = nullptr; // [n_embd, n_outputs] hidden state before final output norm
 
-    std::map<llama_seq_id, ggml_tensor*> t_sampled_logits;
-    std::map<llama_seq_id, ggml_tensor*> t_candidates;
-    std::map<llama_seq_id, ggml_tensor*> t_sampled;
-    std::map<llama_seq_id, ggml_tensor*> t_sampled_probs;
+    std::vector<ggml_tensor *> t_layer_inp;
+
+    std::map<llama_seq_id, ggml_tensor *> t_sampled_logits;
+    std::map<llama_seq_id, ggml_tensor *> t_candidates;
+    std::map<llama_seq_id, ggml_tensor *> t_sampled;
+    std::map<llama_seq_id, ggml_tensor *> t_sampled_probs;
 
     std::vector<llm_graph_input_ptr> inputs;
 
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 032944cb481c..4f23466ce02b 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -48,6 +48,7 @@ struct llama_hparams {
 
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
+    uint32_t n_embd_inp_impl = 0;
     uint32_t n_layer_all;
     uint32_t n_layer_nextn = 0;
     uint32_t n_expert = 0;
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 4f12e0949acb..a31a23c06149 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2238,7 +2238,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     // TODO: move reranking logic here and generalize
     llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers);
 
-    llm->res->set_outputs();
+    llm->res->set_outputs(params);
 
     return llm->res->get_gf();
 }
@@ -2687,3 +2687,11 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
         layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED);
     }
 }
+
+ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model) { // returns the stored tok_embd pointer as-is (callers check it for null)
+    return model->tok_embd;
+}
+
+void llama_model_set_tok_embd(struct llama_model * model, ggml_tensor * tensor) { // overwrites the pointer only; nothing is copied or freed here
+    model->tok_embd = tensor; // NOTE(review): presumably `tensor` must outlive `model` — confirm ownership with callers
+}
diff --git a/src/models/llama.cpp b/src/models/llama.cpp
index c0ec7e0a9adb..4bfebc8843c6 100644
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@@ -124,6 +124,8 @@ llama_model_llama::graph<embed>::graph(const llama_model & model, const llm_grap
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
         ggml_tensor * inpSA = inpL;
 
         // norm
diff --git a/src/models/openai-moe.cpp b/src/models/openai-moe.cpp
index 3ab15d61f08c..6d74f9c7e6ef 100644
--- a/src/models/openai-moe.cpp
+++ b/src/models/openai-moe.cpp
@@ -75,6 +75,8 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
         const float freq_base_l  = model.get_rope_freq_base (cparams, il);
         const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
 
diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp
index 1d0d2fab362a..f4b2a2aebe0f 100644
--- a/src/models/qwen3.cpp
+++ b/src/models/qwen3.cpp
@@ -69,6 +69,8 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
         ggml_tensor * inpSA = inpL;
 
         // norm
diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp
index 317e668bec79..6f6df5390e33 100644
--- a/src/models/qwen3moe.cpp
+++ b/src/models/qwen3moe.cpp
@@ -78,6 +78,8 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
     for (int il = 0; il < n_layer; ++il) {
+        res->t_layer_inp[il] = inpL;
+
         ggml_tensor * inpSA = inpL;
 
         // norm

From 800494f85f1397d096cd4e63d9a7f2ced439d0a6 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Mon, 18 May 2026 13:37:43 +0000
Subject: [PATCH 02/27] spec: support eagle3

---
 common/speculative.cpp          | 436 +++++++++++++++++++++++++++++++-
 common/speculative.h            |   4 +
 conversion/base.py              |   4 +
 conversion/llama.py             | 119 ++++++++-
 convert_hf_to_gguf.py           |  10 +
 gguf-py/gguf/constants.py       |  30 +++
 src/llama-arch.cpp              |  12 +
 src/llama-arch.h                |   8 +
 src/llama-context.cpp           |  70 ++++-
 src/llama-context.h             |  11 +
 src/llama-ext.h                 |  18 ++
 src/llama-hparams.h             |   7 +
 src/llama-model-loader.cpp      |   1 +
 src/llama-model.cpp             |  24 ++
 src/llama-model.h               |  10 +
 src/models/eagle3.cpp           | 300 ++++++++++++++++++++++
 src/models/models.h             |  15 ++
 tools/server/server-context.cpp |   3 +
 18 files changed, 1069 insertions(+), 13 deletions(-)
 create mode 100644 src/models/eagle3.cpp

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 86c1e6a42903..79202842023e 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -375,31 +375,425 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
     }
 };
 
+
+// EAGLE3 speculative decoding state
+//
+// Input of draft decoder: (This is different compared to MTP)
+//   At "pos P", the decoder takes input pair (t_{P+1}, g_P), with RoPE at P.
+//     - t_{P+1} = token at sequence pos P+1 (the *next* token after P)
+//     - g_P     = encoder output = projection of target's extracted hidden states at P
+//
+// Deferred boundary (MTP doesn't have this issue):
+//   Within a single process() call with n_tokens, we can only write decoder KV for
+//   training pos 0..n_tokens-2. The last training pos (n_tokens-1) needs t_{n_tokens}
+//   which lies *outside* this batch — it is either the token the target will sample next or the first token of the next ubatch.
+//   So the last training pos of each process() call is *deferred* to whichever next call has
+//   the missing token in hand:
+//     - multi-ubatch prefill: the next process()'s first token completes the pair
+//                              (handled by the per-seq "cross-ubatch bridge")
+//     - single-ubatch prefill / after verify: draft()'s seed step uses "dp.id_last"
+//                              (target's freshest sample) to complete the pair
+//
+// Per-seq carry-over state:
+//   pending_g_last    [n_embd_dec]  ┐  the deferred boundary's (g, pos). Set by
+//   pending_pos_last  llama_pos     ┘  process() at end of ubatch (= last row);
+//                                       rebased by accept() to first-non-accepted pos.
+//   verify_g          [N × n_embd_dec] snapshot of process()'s encoder output;
+//   verify_pos_first  llama_pos         consumed by accept() to recover the right
+//   verify_g_rows     int32_t           pending_g_last row for any n_accepted value.
+//
+// Performance is good overall, but there is some waste in the verify cycle:
+//   process() runs encoder + decoder on the *full* verify batch including rows for
+//   rejected drafts. The KV at those positions is then dropped.
+//
+// TODO: Not sure whether this waste is worth optimizing.
+// If so, we may need a hybrid stash:
+//      in verify mode, have process() only stash features and let draft()'s seed step run
+//      encoder+decoder on n_accepted+1 rows.
 struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
-    //common_params_speculative_eagle3 params;
+    common_params_speculative_draft params;
+    llama_batch batch;
+
+    std::vector<common_sampler_ptr> smpls;
+
+    int32_t         n_embd_dec       = 0;       // draft hidden size
+    int32_t         n_embd_enc       = 0;       // n_extract_layers * target_hidden_size
+    int32_t         tgt_hidden       = 0;       // target model hidden size
+    const int32_t * extract_layers   = nullptr; // model_dft's extract layer indices
+    uint32_t        n_extract_layers = 0;
+
+    // [per-seq] deferred boundary state
+    std::vector<std::vector<float>> pending_g_last;
+    std::vector<llama_pos>          pending_pos_last;
+
+    // [per-seq] snapshot of the most recent process()'s encoder output
+    std::vector<std::vector<float>> verify_g;         // [n_seq][n_rows * n_embd_dec]
+    std::vector<llama_pos>          verify_pos_first; // [n_seq] — pos of verify_g[seq][0]
+    std::vector<int32_t>            verify_g_rows;    // [n_seq] — number of rows
+
+    // scratch buffer for concatenated target features [n_tokens, n_embd_enc]
+    std::vector<float> features_buf;
 
     common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
         : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
+        , params(params.draft) // member copy of the draft params; the ctor argument shadows the member inside this body
     {
         LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__);
         LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min);
+
+        auto * ctx_tgt = this->params.ctx_tgt;
+        auto * ctx_dft = this->params.ctx_dft;
+        GGML_ASSERT(ctx_tgt && ctx_dft && "EAGLE3 requires ctx_tgt and ctx_dft to be set");
+
+        const llama_model * model_dft = llama_get_model(ctx_dft);
+        const llama_model * model_tgt = llama_get_model(ctx_tgt);
+
+        extract_layers   = llama_model_target_extract_layers  (model_dft);
+        n_extract_layers = llama_model_n_target_extract_layers(model_dft);
+        if (n_extract_layers != 3) { // this implementation accepts exactly 3 extract layers
+            throw std::runtime_error("draft model is not eagle3 (expected 3 extract layers, got " +
+                                     std::to_string(n_extract_layers) + ")");
+        }
+
+        tgt_hidden = (int32_t) llama_model_target_hidden_size(model_dft);
+        if (tgt_hidden != llama_model_n_embd(model_tgt)) { // the draft's expected target width must equal the target's n_embd
+            throw std::runtime_error("EAGLE3 target_hidden_size mismatch (draft expects " +
+                                     std::to_string(tgt_hidden) + ", target n_embd is " +
+                                     std::to_string(llama_model_n_embd(model_tgt)) + ")");
+        }
+
+        n_embd_dec = llama_model_n_embd(model_dft);
+        n_embd_enc = (int32_t) n_extract_layers * tgt_hidden; // width of one concatenated feature row fed to the encoder
+
+        const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
+        batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1);
+        // llama_batch_init allocates only one of token/embd; eagle3 decoder needs both.
+        // TODO: fix, how to call without malloc
+        batch.token = (llama_token *) malloc(sizeof(llama_token) * n_b); // released in the destructor; result is not checked for null
+
+        smpls.resize(n_seq); // one draft sampler per sequence
+        for (auto & s : smpls) {
+            common_params_sampling sparams;
+            sparams.no_perf  = false;
+            sparams.top_k    = 10;
+            sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K }; // top-k is the only sampler stage configured
+            s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams));
+        }
+
+        // turn on extraction of the target layers' input embeddings
+        for (uint32_t k = 0; k < n_extract_layers; ++k) {
+            llama_set_output_layer_inp(ctx_tgt, (uint32_t) extract_layers[k], true);
+        }
+
+        // turn on extraction of the draft model's pre-norm hidden state
+        // (used both for the encoder output g_embd and the decoder pre-norm output)
+        llama_set_embeddings_pre_norm(ctx_dft, true);
+
+        pending_g_last.assign(n_seq, std::vector<float>(n_embd_dec, 0.0f));
+        pending_pos_last.assign(n_seq, -1); // -1 = no deferred boundary recorded yet
+
+        verify_g.assign(n_seq, std::vector<float>());
+        verify_pos_first.assign(n_seq, -1);
+        verify_g_rows.assign(n_seq, 0); // 0 rows = no snapshot; accept() returns early in that case
     }
 
-    void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override {
-        // noop
+    ~common_speculative_impl_draft_eagle3() override { // releases the batch buffers set up in the constructor
+        if (batch.token != nullptr) {
+            free(batch.token); // this buffer was allocated with malloc() in the constructor
+            batch.token = nullptr; // cleared before llama_batch_free(), presumably so it is not freed twice — TODO confirm
+        }
+        llama_batch_free(batch);
     }
 
-    bool process(const llama_batch & /*batch*/) override {
-        // TODO: implement
+    void begin(llama_seq_id seq_id, const llama_tokens & prompt) override { // diagnostic only: warns if the draft context lags behind the prompt
+        const int32_t N = (int32_t) prompt.size();
+        if (N <= 0) { // empty prompt: nothing to check
+            return;
+        }
+        // expected state after prefill: ctx_dft has pos 0..N-2 (last position is deferred to
+        // draft()'s seed step). Warn only if more than one position is missing.
+        auto * ctx_dft = this->params.ctx_dft;
+        const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
+        if (pos_max < N - 2) { // no state is modified here; the mismatch is only logged
+            LOG_WRN("%s: ctx_dft pos_max=%d < N-2=%d — process() did not run on every prefill ubatch. "
+                    "Drafts may degrade.\n",
+                    __func__, (int) pos_max, N - 2);
+        }
+    }
+
+    bool process(const llama_batch & batch_in) override { // returns false only when the draft encode/decode call fails
+        if (batch_in.n_tokens <= 0) {
+            return true;
+        }
+
+        if (batch_in.token == nullptr || batch_in.embd != nullptr) { // only token-only batches are handled; anything else is ignored
+            return true;
+        }
+
+        const int32_t n_tokens = batch_in.n_tokens;
+
+        // i_batch_beg[seq] / i_batch_end[seq]: inclusive batch indices of this seq's
+        // first/last token in batch_in. Assumes per-seq tokens are contiguous within
+        // the ubatch (server's default ordering).
+        std::vector<int32_t> i_batch_beg(n_seq, -1);
+        std::vector<int32_t> i_batch_end(n_seq, -1);
+        for (int k = 0; k < n_tokens; ++k) {
+            GGML_ASSERT(batch_in.n_seq_id[k] == 1); // each token must belong to exactly one sequence
+            const llama_seq_id seq_id = batch_in.seq_id[k][0];
+            if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { // sequences outside [0, n_seq) are skipped
+                continue;
+            }
+            i_batch_end[seq_id] = k;
+            if (i_batch_beg[seq_id] < 0) {
+                i_batch_beg[seq_id] = k;
+            }
+        }
+
+        auto * ctx_tgt = this->params.ctx_tgt;
+        auto * ctx_dft = this->params.ctx_dft;
+
+        // Interleave each extract_layer's hidden state into a contiguous buffer of
+        // shape [n_tokens, n_extract_layers * tgt_hidden]. Then run EAGLE3 encoder
+        // to get one g_embd row per token.
+        features_buf.assign((size_t) n_tokens * n_embd_enc, 0.0f);
+
+        for (uint32_t k = 0; k < n_extract_layers; ++k) {
+            const float * layer = llama_get_output_layer_inp(ctx_tgt, (uint32_t) extract_layers[k]);
+            if (!layer) {
+                GGML_ABORT("EAGLE3: target layer %d input not extracted.",
+                           extract_layers[k]);
+            }
+            for (int32_t i = 0; i < n_tokens; ++i) { // assumes `layer` holds n_tokens rows of tgt_hidden floats — TODO confirm
+                float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) tgt_hidden;
+                const float * src = layer + (size_t) i * tgt_hidden;
+                std::memcpy(dst, src, (size_t) tgt_hidden * sizeof(float));
+            }
+        }
+
+        llama_batch enc_batch = {
+            /*.n_tokens =*/ n_tokens,
+            /*.token    =*/ nullptr,
+            /*.embd     =*/ features_buf.data(),
+            /*.pos      =*/ nullptr,
+            /*.n_seq_id =*/ nullptr,
+            /*.seq_id   =*/ nullptr,
+            /*.logits   =*/ nullptr,
+        };
+        int rc = llama_encode(ctx_dft, enc_batch);
+        if (rc != 0) {
+            LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d)\n",
+                    __func__, rc, (int) n_tokens);
+            return false;
+        }
+
+        // g_embd has shape [n_tokens, n_embd_dec] in ctx_dft's pre-norm embeddings buffer
+        const float * g_embd = llama_get_embeddings_pre_norm(ctx_dft);
+        GGML_ASSERT(g_embd && "EAGLE3 encoder produced no output.");
+
+        const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
+
+        // EAGLE3 decoder input convention: at memory pos P the input pair is
+        // (token[P+1], g_embd[P]). This shifts the token index "left by one" relative to g_embd.
+        //
+        // Per seq, in order:
+        //   (a) cross-ubatch bridge — when applicable, write the previously-deferred
+        //       pos using this ubatch's first token + pending_g_last.
+        //   (b) main write loop — for k in [beg, end-1], write (token[k+1], g_embd[k])
+        //       at pos[k]. The last training pos (k=end) is left unwritten = new
+        //       deferred boundary, completed by the next process() or draft() call.
+        //   (c) refresh deferred state — stash this ubatch's full g_embd into verify_g,
+        //       update pending_g_last / pending_pos_last to the last row.
+        common_batch_clear(batch);
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            const int32_t beg = i_batch_beg[seq_id];
+            const int32_t end = i_batch_end[seq_id];
+            if (beg < 0 || end < 0) { // this sequence has no tokens in batch_in
+                continue;
+            }
+
+            // cross-ubatch bridge — complete the prior ubatch's deferred boundary.
+            // Fires iff all three preconditions hold:
+            //   1) pending_pos_last >= 0
+            //   2) pending_pos_last + 1 == pos[beg]
+            //   3) pending_pos_last > dft_pos_max
+            const llama_pos pending_pos = pending_pos_last[seq_id];
+            if (pending_pos >= 0 && pending_pos + 1 == batch_in.pos[beg]) {
+                const llama_pos dft_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
+                if (pending_pos > dft_pos_max) {
+                    common_batch_add(batch, batch_in.token[beg], pending_pos, { seq_id }, /*logits=*/ false);
+                    std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
+                                pending_g_last[seq_id].data(), row_bytes); // embedding row for the entry just added
+                }
+            }
+
+            for (int32_t k = beg; k < end; ++k) { // row `end` is intentionally excluded: it becomes the new deferred boundary
+                common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k],
+                                 { seq_id }, /*logits=*/ false);
+                std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
+                            g_embd + (size_t) k * n_embd_dec, row_bytes);
+            }
+
+            // refresh deferred state
+            const int32_t n_rows = end - beg + 1;
+            verify_pos_first[seq_id] = batch_in.pos[beg];
+            verify_g_rows[seq_id]    = n_rows;
+            verify_g[seq_id].assign((size_t) n_rows * n_embd_dec, 0.0f);
+            std::memcpy(verify_g[seq_id].data(),
+                        g_embd + (size_t) beg * n_embd_dec,
+                        (size_t) n_rows * row_bytes);
+
+            std::memcpy(pending_g_last[seq_id].data(),
+                        g_embd + (size_t) end * n_embd_dec, row_bytes);
+            pending_pos_last[seq_id] = batch_in.pos[end];
+        }
+
+        if (batch.n_tokens > 0) { // nothing to decode when every sequence contributed only its deferred row
+            rc = llama_decode(ctx_dft, batch);
+            if (rc != 0) {
+                LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n",
+                        __func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]);
+                return false;
+            }
+        }
+
         return true;
     }
 
-    void draft(common_speculative_draft_params_vec & /*dparams*/) override {
-        // TODO: implement
+    void draft(common_speculative_draft_params_vec & dparams) override { // appends drafted tokens to *dp.result for each drafting sequence
+        auto & ctx_dft = params.ctx_dft;
+
+        common_batch_clear(batch);
+
+        // keep track of which sequences are still drafting
+        int n_drafting = 0;
+        std::vector<bool> drafting(n_seq);
+
+        const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
+
+        // Complete the deferred boundary pair (dp.id_last, pending_g_last) at memory
+        // pos pending_pos_last. dp.id_last is target's freshest sample (= corrected
+        // token after verify, or first generated token after prefill), matching the
+        // EAGLE3 input convention (token[P+1], g_embd[P]) at pos P.
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            auto & dp = dparams[seq_id]; // assumes dparams has at least n_seq entries — TODO confirm
+
+            if (!dp.drafting) {
+                continue;
+            }
+            if (pending_pos_last[seq_id] < 0) { // no deferred boundary recorded yet for this sequence
+                continue;
+            }
+
+            n_drafting++;
+            drafting[seq_id] = true;
+            common_sampler_reset(smpls[seq_id].get());
+
+            llama_memory_seq_rm(llama_get_memory(ctx_dft), seq_id, pending_pos_last[seq_id], -1); // clear draft memory from the deferred pos onward before re-writing it
+
+            common_batch_add(batch, dp.id_last, pending_pos_last[seq_id], { seq_id }, true);
+            std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
+                        pending_g_last[seq_id].data(),
+                        row_bytes);
+        }
+
+        if (batch.n_tokens == 0) { // no sequence is eligible for drafting
+            return;
+        }
+
+        int ret = llama_decode(ctx_dft, batch);
+        if (ret != 0) {
+            LOG_WRN("%s: llama_decode returned %d\n", __func__, ret);
+            return;
+        }
+
+        int i = 0; // index of the current draft step
+
+        while (n_drafting > 0) {
+            int i_batch = 0; // output row of the previous decode; advances once per still-drafting sequence
+
+            common_batch_clear(batch);
+
+            for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+                if (!drafting[seq_id]) {
+                    continue;
+                }
+
+                auto * smpl = smpls[seq_id].get();
+
+                common_sampler_sample(smpl, ctx_dft, i_batch, true);
+                // pre-norm hidden state of this position becomes g_embd for the next step
+                const float * prenorm = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch);
+                ++i_batch;
+
+                const auto * cur_p = common_sampler_get_candidates(smpl, true);
+
+                for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
+                    LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                            seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p,
+                            common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
+                }
+
+                const llama_token id = cur_p->data[0].id; // the first candidate is taken as the draft token
+
+                // only collect very high-confidence draft tokens
+                // (configurable via --spec-draft-p-min, set to 0.0 to disable early-stop)
+                if (cur_p->data[0].p < params.p_min) {
+                    drafting[seq_id] = false;
+                    n_drafting--;
+
+                    continue;
+                }
+
+                common_sampler_accept(smpl, id, true);
+
+                auto & dp = dparams.at(seq_id);
+                auto & result = *dp.result;
+
+                result.push_back(id);
+
+                if ((params.n_max <= (int) result.size()) ||
+                    (dp.n_max > 0 && dp.n_max <= (int) result.size())) { // global or per-sequence draft length limit reached
+                    drafting[seq_id] = false;
+                    n_drafting--;
+                    continue;
+                }
+
+                common_batch_add(batch, id, pending_pos_last[seq_id] + (i + 1), { seq_id }, true);
+                std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
+                            prenorm,
+                            row_bytes);
+            }
+
+            if (batch.n_tokens == 0) { // every sequence stopped during this step
+                break;
+            }
+
+            ret = llama_decode(ctx_dft, batch);
+            if (ret != 0) {
+                LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret);
+                break;
+            }
+
+            ++i;
+        }
     }
 
-    void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override {
-        // noop
+    void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override { // rebase the deferred boundary from the last process() snapshot
+        if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
+            return;
+        }
+
+        const int32_t n_rows = verify_g_rows[seq_id];
+        if (n_rows <= 0) { // no snapshot has been recorded for this sequence
+            return;
+        }
+
+        const int32_t i_g = std::min<int32_t>(n_accepted, n_rows - 1); // clamp to the last stored row
+        pending_pos_last[seq_id] = verify_pos_first[seq_id] + i_g; // assumes snapshot rows cover consecutive positions — TODO confirm
+        std::memcpy(pending_g_last[seq_id].data(),
+                    verify_g[seq_id].data() + (size_t) i_g * n_embd_dec,
+                    (size_t) n_embd_dec * sizeof(float));
     }
 
     bool need_embd() const override {
@@ -1369,9 +1763,11 @@ common_speculative * common_speculative_init(common_params_speculative & params,
         uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);
 
         bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
-        bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
+        bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && has_draft_model_path;
         bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
 
+
+
         bool has_ngram_cache   = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_CACHE));
         bool has_ngram_simple  = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE));
         bool has_ngram_map_k   = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K));
@@ -1499,6 +1895,26 @@ void common_speculative_free(common_speculative * spec) {
     delete spec;
 }
 
+void common_speculative_setup_draft_model(struct llama_model * model_dft, const struct llama_model * model_tgt) {
+    if (model_dft == nullptr || model_tgt == nullptr) {
+        return;
+    }
+    if (llama_model_get_tok_embd(model_dft) == nullptr) {
+        ggml_tensor * tgt_tok_embd = llama_model_get_tok_embd(model_tgt);
+        if (tgt_tok_embd != nullptr) {
+            llama_model_set_tok_embd(model_dft, tgt_tok_embd);
+            LOG_INF("%s: draft inheriting target's tok_embd\n", __func__);
+        }
+    }
+    if (llama_model_get_lm_head(model_dft) == nullptr) {
+        ggml_tensor * tgt_lm_head = llama_model_get_lm_head(model_tgt);
+        if (tgt_lm_head != nullptr) {
+            llama_model_set_lm_head(model_dft, tgt_lm_head);
+            LOG_INF("%s: draft inheriting target's lm_head\n", __func__);
+        }
+    }
+}
+
 common_speculative_draft_params & common_speculative_get_draft_params(
         common_speculative * spec,
         llama_seq_id seq_id) {
diff --git a/common/speculative.h b/common/speculative.h
index bf76ad709e26..f1cfcb237f4c 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -27,6 +27,10 @@ common_speculative * common_speculative_init(common_params_speculative & params,
 
 void common_speculative_free(common_speculative * spec);
 
+// Optional setup hook to call once after loading the draft model but before creating its context.
+// Inherits any missing weights from the target model (e.g. tok_embd / lm_head from target model for eagle3 / dflash)
+void common_speculative_setup_draft_model(struct llama_model * model_dft, const struct llama_model * model_tgt);
+
 struct common_speculative_draft_params {
     // this flag is used to chain the drafts through all the available implementations
     // after the first successful draft from an implementation, we set it
diff --git a/conversion/base.py b/conversion/base.py
index 408e209aa884..9d81c19b46de 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -94,6 +94,7 @@ class ModelBase:
     metadata: gguf.Metadata
     dir_model_card: Path
     remote_hf_model_id: str | None
+    target_model_dir: Path | None
 
     # subclasses should define this!
     model_arch: gguf.MODEL_ARCH
@@ -119,6 +120,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
                  small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None,
                  disable_mistral_community_chat_template: bool = False,
                  sentence_transformers_dense_modules: bool = False,
+                 target_model_dir: Path | None = None,
                  fuse_gate_up_exps: bool = False,
                  fp8_as_q8: bool = False):
         if type(self) is ModelBase or \
@@ -139,6 +141,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         self.dry_run = dry_run
         self.remote_hf_model_id = remote_hf_model_id
         self.sentence_transformers_dense_modules = sentence_transformers_dense_modules
+        self.target_model_dir = target_model_dir
         self.fuse_gate_up_exps = fuse_gate_up_exps
         self._gate_exp_buffer: dict[int, Tensor] = {}
         self._up_exp_buffer: dict[int, Tensor] = {}
@@ -2481,6 +2484,7 @@ class LazyTorchTensor(gguf.LazyBase):
         torch.float16: np.float16,
         torch.float32: np.float32,
         torch.uint8: np.uint8,
+        torch.int64: np.int64,
     }
 
     # only used when byteswapping data. Only correct size is needed
diff --git a/conversion/llama.py b/conversion/llama.py
index fd6167bfd91f..db073b9b361a 100644
--- a/conversion/llama.py
+++ b/conversion/llama.py
@@ -10,7 +10,7 @@
 if TYPE_CHECKING:
     from torch import Tensor
 
-from .base import ModelBase, TextModel, gguf
+from .base import ModelBase, TextModel, gguf, logger
 
 
 @ModelBase.register(
@@ -21,6 +21,9 @@
     "VLlama3ForCausalLM",
     "LlavaForConditionalGeneration",
     "VoxtralForConditionalGeneration",
+    "LlamaForCausalLMEagle3",
+    "Eagle3Speculator",
+    "Eagle3DraftModel",
     "IQuestCoderForCausalLM",
     "LlamaModel")
 class LlamaModel(TextModel):
@@ -39,7 +42,57 @@ def __init__(self, *args, **kwargs):
             hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
             self.origin_hf_arch = hparams.get('architectures', [None])[0]
 
+        # Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name)
+        if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
+            self.is_eagle3 = True
+            self.model_arch = gguf.MODEL_ARCH.EAGLE3
+            logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture")
+            # Re-initialize tensor_map with eagle3 architecture
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+            # Update gguf_writer architecture
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
+            self.gguf_writer.add_architecture()
+            if self.target_model_dir is None:
+                raise ValueError(
+                    "EAGLE-3 model requires --target-model-dir to be specified. "
+                    "Please provide the path to the target model directory to read config.json"
+                )
+            # Read both eagle3 raw config and target model config
+            with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f:
+                eagle3_raw_config = json.load(f)
+            with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
+                target_config = json.load(f)
+
+            # extract_layers: derived from target model layer count (low/mid/high)
+            target_num_layers = target_config["num_hidden_layers"]
+            extract_layers = [2, target_num_layers // 2, target_num_layers - 3]
+            logger.info(f"EAGLE-3: extract_layers = {extract_layers} (target model has {target_num_layers} layers)")
+            self.gguf_writer.add_array(f"{self.gguf_writer.arch}.extract_layers", extract_layers)
+
+            # target_hidden_size: prefer eagle3 config, fallback to target config
+            if eagle3_raw_config.get("target_hidden_size") is not None:
+                target_hidden_size = eagle3_raw_config["target_hidden_size"]
+                src = "EAGLE-3 config"
+            else:
+                target_hidden_size = target_config["hidden_size"]
+                src = "target model config"
+            logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})")
+            self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size)
+
+            # norm_before_residual (RedHat-style eagle3 specific)
+            norm_before_residual = eagle3_raw_config.get("norm_before_residual", False)
+            logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}")
+            self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual)
+
     def set_vocab(self):
+        # eagle3: use tokenizer from target model if provided
+        original_dir_model = None
+        if getattr(self, 'is_eagle3', False):
+            assert self.target_model_dir is not None
+            logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}")
+            original_dir_model = self.dir_model
+            self.dir_model = self.target_model_dir
+
         if self.origin_hf_arch == "GlmasrModel":
             return self._set_vocab_glmedge()
 
@@ -85,6 +138,10 @@ def set_vocab(self):
         if self.hparams.get("vocab_size", 32000) == 49152:
             self.gguf_writer.add_add_bos_token(False)
 
+        # eagle3: Restore original dir_model
+        if original_dir_model is not None:
+            self.dir_model = original_dir_model
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -129,7 +186,49 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
 
         return super().filter_tensors((name, gen))
 
+    def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]:
+        tensors = super().index_tensors(remote_hf_model_id)
+
+        # Handle Eagle3Speculator nested config
+        if "transformer_layer_config" in self.hparams:
+            self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]}
+
+        # eagle3 detection
+        if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1:
+            logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*")
+            new_tensors = {}
+            for name, gen in tensors.items():
+                if name.startswith("midlayer."):
+                    new_name = "model.layers.0." + name[len("midlayer."):]
+                    new_tensors[new_name] = gen
+                elif name.startswith("layers.0."):  # Eagle3Speculator format
+                    new_name = "model." + name
+                    new_tensors[new_name] = gen
+                else:
+                    new_tensors[name] = gen
+            return new_tensors
+
+        return tensors
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # eagle3: special tensors that bypass standard llama mapping
+        if getattr(self, 'is_eagle3', False):
+            if name == "fc.weight":
+                yield (name, data_torch)
+                return
+            if name == "d2t":
+                # store for manual int64 handling in prepare_tensors (avoid F32 conversion)
+                if not hasattr(self, '_eagle3_int_tensors'):
+                    self._eagle3_int_tensors = {}
+                self._eagle3_int_tensors[name] = data_torch
+                return
+            if name == "t2d":
+                # not used at runtime, skip
+                return
+            if name == "model.layers.0.hidden_norm.weight":
+                yield ("blk.0.hidden_norm.weight", data_torch)
+                return
+
         n_head = self.find_hparam(["n_heads", "num_attention_heads"])
         n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
 
@@ -205,8 +304,26 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                 yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
 
     def prepare_tensors(self):
+        # eagle3: collect d2t original dtype before parent converts tensors to F32
+        eagle3_original_dtypes = {}
+        if getattr(self, 'is_eagle3', False):
+            for name, data_torch in self.get_tensors():
+                if name == "d2t":
+                    eagle3_original_dtypes[name] = data_torch.dtype
+
         super().prepare_tensors()
 
+        # eagle3: write d2t as int64 directly (not converted to F32)
+        if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
+            for name, data_torch in self._eagle3_int_tensors.items():
+                old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
+                data = data_torch.to(torch.int64).numpy()
+                data_qtype = gguf.GGMLQuantizationType.I64
+
+                shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
+                logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
+                self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype)
+
         if self._experts is not None:
             # flatten `list[dict[str, Tensor]]` into `list[str]`
             experts = [k for d in self._experts for k in d.keys()]
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index a6192c039a0a..3b23d5ebc0d3 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -153,6 +153,15 @@ def parse_args() -> argparse.Namespace:
         help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.",
     )
 
+    parser.add_argument(
+        "--target-model-dir", type=str, default=None,
+        help=(
+            "path to the target model directory; required when converting a standalone draft model "
+            "(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and "
+            "layer count to populate its GGUF."
+        ),
+    )
+
     args = parser.parse_args()
     if not args.print_supported_models and args.model is None:
         parser.error("the following arguments are required: model")
@@ -269,6 +278,7 @@ def main() -> None:
                                      small_first_shard=args.no_tensor_first_split,
                                      remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template,
                                      sentence_transformers_dense_modules=args.sentence_transformers_dense_modules,
+                                     target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None,
                                      fuse_gate_up_exps=args.fuse_gate_up_exps,
                                      fp8_as_q8=args.fp8_as_q8,
                                      )
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index bd6246137b0a..1ad57f24d3c0 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -154,6 +154,9 @@ class LLM:
         HIDDEN_ACT                        = "{arch}.hidden_activation"
         DENSE_FEAT_IN_SIZE                = "{arch}.{dense}_feat_in"
         DENSE_FEAT_OUT_SIZE               = "{arch}.{dense}_feat_out"
+        EAGLE3_EXTRACT_LAYERS             = "{arch}.extract_layers"
+        EAGLE3_TARGET_HIDDEN_SIZE         = "{arch}.target_hidden_size"
+        EAGLE3_NORM_BEFORE_RESIDUAL       = "{arch}.norm_before_residual"
 
     class Attention:
         HEAD_COUNT                   = "{arch}.attention.head_count"
@@ -510,6 +513,7 @@ class MODEL_ARCH(IntEnum):
     RND1             = auto()
     PANGU_EMBED      = auto()
     MISTRAL3         = auto()
+    EAGLE3           = auto()
     MISTRAL4         = auto()
     PADDLEOCR        = auto()
     MIMO2            = auto()
@@ -906,6 +910,10 @@ class MODEL_TENSOR(IntEnum):
     NEXTN_HNORM          = auto()
     NEXTN_SHARED_HEAD_HEAD = auto()
     NEXTN_SHARED_HEAD_NORM = auto()
+    # eagle3
+    EAGLE3_FC          = auto()  # feature fusion layer
+    EAGLE3_HIDDEN_NORM = auto()  # hidden normalization
+    EAGLE3_D2T         = auto()  # draft to target vocabulary mapping
     # lfm2 audio
     A_ENC_NORM_CONV        = auto()
     A_ENC_LINEAR_POS       = auto()
@@ -1060,6 +1068,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.RND1:             "rnd1",
     MODEL_ARCH.PANGU_EMBED:      "pangu-embedded",
     MODEL_ARCH.MISTRAL3:         "mistral3",
+    MODEL_ARCH.EAGLE3:           "eagle3",
     MODEL_ARCH.MISTRAL4:         "mistral4",
     MODEL_ARCH.PADDLEOCR:        "paddleocr",
     MODEL_ARCH.MIMO2:            "mimo2",
@@ -1483,6 +1492,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.NEXTN_HNORM:               "blk.{bid}.nextn.hnorm",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD:    "blk.{bid}.nextn.shared_head_head",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM:    "blk.{bid}.nextn.shared_head_norm",
+    MODEL_TENSOR.EAGLE3_FC:                 "fc",
+    MODEL_TENSOR.EAGLE3_HIDDEN_NORM:        "blk.{bid}.hidden_norm",
+    MODEL_TENSOR.EAGLE3_D2T:                "d2t",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -4021,6 +4033,24 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
     ],
+    MODEL_ARCH.EAGLE3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.EAGLE3_FC,
+        MODEL_TENSOR.EAGLE3_HIDDEN_NORM,
+        MODEL_TENSOR.EAGLE3_D2T,
+    ],
     MODEL_ARCH.MISTRAL4: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 6a5d5f8d2ac8..46217c5eb753 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -128,6 +128,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_RND1,             "rnd1"             },
     { LLM_ARCH_PANGU_EMBED,      "pangu-embedded"   },
     { LLM_ARCH_MISTRAL3,         "mistral3"         },
+    { LLM_ARCH_EAGLE3,           "eagle3"           },
     { LLM_ARCH_MISTRAL4,         "mistral4"         },
     { LLM_ARCH_PADDLEOCR,        "paddleocr"        },
     { LLM_ARCH_MIMO2,            "mimo2"            },
@@ -292,6 +293,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
 
+    { LLM_KV_EAGLE3_EXTRACT_LAYERS,        "%s.extract_layers"        },
+    { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE,    "%s.target_hidden_size"    },
+    { LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL,  "%s.norm_before_residual"  },
+
     { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
     // sentence-transformers dense modules feature dims
     { LLM_KV_DENSE_2_FEAT_IN,        "%s.dense_2_feat_in"  },
@@ -559,6 +564,9 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_INDEXER_PROJ,                           "blk.%d.indexer.proj" },
     { LLM_TENSOR_INDEXER_ATTN_K,                         "blk.%d.indexer.attn_k" },
     { LLM_TENSOR_INDEXER_ATTN_Q_B,                       "blk.%d.indexer.attn_q_b" },
+    { LLM_TENSOR_EAGLE3_HIDDEN_NORM,                     "blk.%d.hidden_norm" },
+    { LLM_TENSOR_EAGLE3_FC,                              "fc" },
+    { LLM_TENSOR_EAGLE3_D2T,                             "d2t" },
 };
 
 // declare information about the model weight tensors:
@@ -783,6 +791,10 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     // latent projections feed ggml_mul_mat, the buft probe must use MUL_MAT to keep them on GPU
     {LLM_TENSOR_FFN_LATENT_DOWN,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_FFN_LATENT_UP,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    // eagle3
+    {LLM_TENSOR_EAGLE3_FC,                  {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_EAGLE3_HIDDEN_NORM,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_EAGLE3_D2T,                 {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_GET_ROWS}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 03b1a265d67a..60581af024da 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -142,6 +142,7 @@ enum llm_arch {
     LLM_ARCH_TALKIE,
     LLM_ARCH_MELLUM,
     LLM_ARCH_UNKNOWN,
+    LLM_ARCH_EAGLE3,
 };
 
 enum llm_kv {
@@ -336,6 +337,10 @@ enum llm_kv {
 
     LLM_KV_CLASSIFIER_OUTPUT_LABELS,
 
+    LLM_KV_EAGLE3_EXTRACT_LAYERS,
+    LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE,
+    LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL,
+
     LLM_KV_SHORTCONV_L_CACHE,
 
     LLM_KV_XIELU_ALPHA_N,
@@ -566,6 +571,9 @@ enum llm_tensor {
     LLM_TENSOR_NEXTN_HNORM,
     LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
     LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
+    LLM_TENSOR_EAGLE3_FC,
+    LLM_TENSOR_EAGLE3_HIDDEN_NORM,
+    LLM_TENSOR_EAGLE3_D2T,
 };
 
 enum llm_tensor_layer {
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 31f9a530ee7d..4c40bdf3703d 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -87,6 +87,7 @@ llama_context::llama_context(
 
     cparams.ctx_other = nullptr;
     cparams.output_layer_inp.resize(hparams.n_layer, false);
+    embd_layer_inp.resize(hparams.n_layer);
 
     // TODO: more generic
     if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
@@ -1277,6 +1278,13 @@ void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) {
     sched_need_reserve = true;
 }
 
+float * llama_context::get_output_layer_inp(uint32_t layer_id) {
+    if (layer_id >= embd_layer_inp.size() || embd_layer_inp[layer_id].empty()) {
+        return nullptr;
+    }
+    return embd_layer_inp[layer_id].data();
+}
+
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
     if (mctx && !mctx->apply()) {
         LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
@@ -1361,7 +1369,10 @@ int llama_context::encode(const llama_batch & batch_inp) {
 
     const auto & hparams = model.hparams;
 
-    const int64_t n_embd  = hparams.n_embd_inp();
+    // eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim
+    const int64_t n_embd = (hparams.n_embd_target_features > 0 && batch_inp.embd)
+                             ? (int64_t) hparams.n_embd_target_features
+                             : hparams.n_embd_inp();
     const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
@@ -1872,7 +1883,39 @@ int llama_context::decode(const llama_batch & batch_inp) {
             if (n_outputs) {
                 GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
                 GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits.size);
-                ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
+
+                // eagle3: map draft vocab to target vocab (only the last output row is mapped)
+                if (model.arch == LLM_ARCH_EAGLE3 && model.d2t) {
+                    // per-thread host caches; the d2t copy is keyed by its source tensor, so another draft model reloads it
+                    static thread_local const ggml_tensor *  eagle3_d2t_src = nullptr;
+                    static thread_local std::vector<int64_t> eagle3_d2t_map;
+                    static thread_local std::vector<float>   eagle3_draft_logits;
+
+                    const int64_t  draft_vocab_size = t_logits->ne[0];
+                    const uint32_t last_idx         = n_outputs - 1;
+
+                    if (eagle3_d2t_src != model.d2t) {
+                        eagle3_d2t_map.resize(model.d2t->ne[0]);
+                        ggml_backend_tensor_get(model.d2t, eagle3_d2t_map.data(), 0, eagle3_d2t_map.size() * sizeof(int64_t));
+                        eagle3_d2t_src = model.d2t;
+                    }
+                    GGML_ASSERT(draft_vocab_size <= (int64_t) eagle3_d2t_map.size()); // every draft id needs a d2t entry
+
+                    eagle3_draft_logits.resize(draft_vocab_size);
+                    const size_t last_offset = last_idx * draft_vocab_size * sizeof(float);
+                    ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(), last_offset, draft_vocab_size * sizeof(float));
+                    synchronize(); // the draft logits are consumed on the host right below
+
+                    float * last_logits_out = logits_out + last_idx * n_vocab;
+                    std::fill(last_logits_out, last_logits_out + n_vocab, -std::numeric_limits<float>::infinity()); // ids without a draft counterpart stay at -inf
+                    for (int64_t j = 0; j < draft_vocab_size; j++) {
+                        const int64_t target_id = j + eagle3_d2t_map[j]; // d2t stores per-id offsets
+                        GGML_ASSERT(target_id >= 0 && target_id < n_vocab);
+                        last_logits_out[target_id] = eagle3_draft_logits[j];
+                    }
+                } else {
+                    ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
+                }
             }
         }
 
@@ -1936,6 +1979,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
             }
         }
 
+        extract_layer_inputs(res);
+
         // extract nextn embeddings before
         // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored.
         {
@@ -2174,6 +2219,23 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     return n_outputs_max;
 }
 
+void llama_context::extract_layer_inputs(const llm_graph_result * res) {
+    for (uint32_t il = 0; il < cparams.output_layer_inp.size(); ++il) {
+        if (!cparams.output_layer_inp[il]) {
+            continue;
+        }
+        ggml_tensor * t = res->get_layer_inp((int) il);
+        if (!t) {
+            continue;
+        }
+        const size_t nbytes = ggml_nbytes(t);
+        embd_layer_inp[il].resize(nbytes / sizeof(float));
+        ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t);
+        GGML_ASSERT(backend != nullptr);
+        ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data(), 0, nbytes);
+    }
+}
+
 void llama_context::output_reorder() {
     const uint64_t n_vocab = model.vocab.n_tokens();
     const uint64_t n_embd  = model.hparams.n_embd;
@@ -4043,3 +4105,11 @@ llama_context * llama_get_ctx_other(struct llama_context * ctx) {
 void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable) {
     ctx->set_output_layer_inp(layer_id, enable);
 }
+
+// returns the host copy of the input embeddings captured for `layer_id`,
+// or nullptr when the layer is out of range or nothing has been captured for it
+float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id) {
+    // the per-layer host buffers are filled by async backend copies during decode
+    ctx->synchronize();
+    return ctx->get_output_layer_inp(layer_id);
+}
diff --git a/src/llama-context.h b/src/llama-context.h
index 1b516a7bf2b4..d6d483cb97d6 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -128,6 +128,9 @@ struct llama_context {
 
     void set_output_layer_inp(uint32_t layer_id, bool enable);
 
+    // read back the input embeddings of the specified layer
+    float * get_output_layer_inp(uint32_t layer_id);
+
     // process a single ubatch with a specific graph type
     // if memory_context is provided, it will be applied first to the context's memory
     // ret contains the status of the graph computation
@@ -228,6 +231,10 @@ struct llama_context {
     // map the output row index `i` to batch index
     int64_t output_resolve_row(int32_t i) const;
 
+    // async-copy enabled layer-input tensors (per cparams.output_layer_inp)
+    // from backend into host-side embd_layer_inp buffers
+    void extract_layer_inputs(const llm_graph_result * res);
+
     //
     // graph
     //
@@ -356,6 +363,10 @@ struct llama_context {
     // host buffer for the model output (logits and embeddings)
     ggml_backend_buffer_ptr buf_output;
 
+    // host buffer for output layer input embeddings, per layer
+    // populated when cparams.output_layer_inp[il] is true
+    std::vector<std::vector<float>> embd_layer_inp;
+
     // keep copies of the per-sequence memory on the device
     std::map<llama_seq_id, llama_memory_buffers> mem_storage;
 
diff --git a/src/llama-ext.h b/src/llama-ext.h
index c118f9fb3feb..51838a761f10 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -110,6 +110,24 @@ LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
 // set if the layer input embeddings should be outputed
 LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable);
 
+// read back the input embeddings of the specified layer for the most recent ubatch
+// the layer must have been enabled via llama_set_output_layer_inp
+LLAMA_API float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id);
+
 LLAMA_API ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model);
 LLAMA_API void          llama_model_set_tok_embd(      struct llama_model * model, ggml_tensor * tensor);
 
+LLAMA_API ggml_tensor * llama_model_get_lm_head(const struct llama_model * model);
+LLAMA_API void          llama_model_set_lm_head(      struct llama_model * model, ggml_tensor * tensor);
+
+//
+// eagle3/DFlash: consume target model extracted features
+//
+
+// returns pointer to the target-model layer indices
+LLAMA_API const int32_t * llama_model_target_extract_layers  (const struct llama_model * model);
+// returns the number of extracted layers from target model
+LLAMA_API uint32_t        llama_model_n_target_extract_layers(const struct llama_model * model);
+// returns the target model hidden size
+LLAMA_API uint32_t        llama_model_target_hidden_size     (const struct llama_model * model);
+
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 4f23466ce02b..62d91129504d 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -236,6 +236,13 @@ struct llama_hparams {
     // >=0 => input embedding index for deepstack injection
     std::array<int32_t, LLAMA_MAX_LAYERS> deepstack_mapping_arr;
 
+    // eagle3/DFlash shared params
+    // n_embd_target_features = n_extract * target_hidden_size (encoder input dim)
+    uint32_t n_embd_target_features = 0;
+    uint32_t target_hidden_size     = 0;
+    // eagle3: whether to apply hidden_norm before storing residual
+    bool eagle3_norm_before_residual = false;
+
     // gemma4 per-layer embedding
     uint32_t n_embd_per_layer = 0;
 
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index 0d1cf3cc33bb..474cabdfc095 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -394,6 +394,7 @@ namespace GGUFMeta {
 
     template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
     template bool llama_model_loader::get_arr<std::array<int32_t, 512>>(enum llm_kv kid, std::array<int32_t, 512> & result, bool required);
+    template bool llama_model_loader::get_arr<std::vector<int32_t>>(enum llm_kv kid, std::vector<int32_t> & result, bool required);
 
     template<typename T>
     bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index a31a23c06149..a41740f81ce2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -287,6 +287,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_qwen35moe(params);
         case LLM_ARCH_MISTRAL3:
             return new llama_model_mistral3(params);
+        case LLM_ARCH_EAGLE3:
+            return new llama_model_eagle3(params);
         case LLM_ARCH_MIMO2:
             return new llama_model_mimo2(params);
         case LLM_ARCH_KIMI_LINEAR:
@@ -2406,6 +2408,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_ERNIE4_5:
         case LLM_ARCH_ERNIE4_5_MOE:
         case LLM_ARCH_MISTRAL3:
+        case LLM_ARCH_EAGLE3:
         case LLM_ARCH_MISTRAL4:
         case LLM_ARCH_LLAMA_EMBED:
         case LLM_ARCH_MAINCODER:
@@ -2695,3 +2698,24 @@ ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model) {
 void llama_model_set_tok_embd(struct llama_model * model, ggml_tensor * tensor) {
     model->tok_embd = tensor;
 }
+
+ggml_tensor * llama_model_get_lm_head(const struct llama_model * model) {
+    return model->output;
+}
+
+void llama_model_set_lm_head(struct llama_model * model, ggml_tensor * tensor) {
+    model->output = tensor;
+}
+
+const int32_t * llama_model_target_extract_layers(const struct llama_model * model) {
+    const auto & v = model->target_extract_layers;
+    return v.empty() ? nullptr : v.data();
+}
+
+uint32_t llama_model_n_target_extract_layers(const struct llama_model * model) {
+    return (uint32_t) model->target_extract_layers.size();
+}
+
+uint32_t llama_model_target_hidden_size(const struct llama_model * model) {
+    return model->hparams.target_hidden_size;
+}
diff --git a/src/llama-model.h b/src/llama-model.h
index 992c8d9c8fd9..b28eb7baf256 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -475,6 +475,9 @@ struct llama_layer {
     struct ggml_tensor * ffn_act_beta    = nullptr;
     struct ggml_tensor * ffn_act_eps     = nullptr;
 
+    // eagle3
+    struct ggml_tensor * eagle3_hidden_norm = nullptr;
+
     // Kimi Linear KDA (using ssm_ prefix for consistency)
     // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias
     struct ggml_tensor * ssm_q_conv = nullptr;
@@ -569,6 +572,13 @@ struct llama_model {
     struct ggml_tensor * per_layer_model_proj = nullptr;
     struct ggml_tensor * per_layer_proj_norm  = nullptr;
 
+    // eagle3
+    struct ggml_tensor * fc  = nullptr;  // feature fusion layer
+    struct ggml_tensor * d2t = nullptr;  // draft to target vocabulary mapping
+
+    // unified vector to store target-model extracted layer ids in eagle3, dflash, etc.
+    std::vector<int32_t> target_extract_layers;
+
     std::vector<llama_layer> layers;
 
     //Dense linear projections for SentenceTransformers models like embeddinggemma
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
new file mode 100644
index 000000000000..3694d262cb85
--- /dev/null
+++ b/src/models/eagle3.cpp
@@ -0,0 +1,300 @@
+#include "models.h"
+
+void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+    if (!ml.get_arr(LLM_KV_EAGLE3_EXTRACT_LAYERS, target_extract_layers, false)) {
+        throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata");
+    }
+    if (target_extract_layers.size() != 3) {
+        throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'");
+    }
+    LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__,
+            target_extract_layers[0],
+            target_extract_layers[1],
+            target_extract_layers[2]);
+
+    ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.target_hidden_size);
+    LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__,
+            hparams.target_hidden_size, hparams.n_embd);
+
+    hparams.n_embd_target_features = (uint32_t) target_extract_layers.size() * hparams.target_hidden_size;
+
+    // eagle3 norm_before_residual (optional, default false)
+    // compatible with the Red Hat (RedHatAI) eagle3 speculator model
+    ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false);
+    if (hparams.eagle3_norm_before_residual) {
+        LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__);
+    }
+
+    type = LLM_TYPE_UNKNOWN;
+}
+
+void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t n_embd_target_features = (int64_t) hparams.n_embd_target_features;
+    const int64_t n_embd_attn_input = 2 * n_embd;
+
+    // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target)
+    // d2t: draft to target vocabulary mapping (looked up under the same tn() name that create_tensor() loads below)
+    int64_t n_draft_vocab = n_vocab;  // Default: same as target vocab
+    const struct ggml_tensor * d2t_meta = ml->get_tensor_meta(tn(LLM_TENSOR_EAGLE3_D2T).str().c_str());
+    if (d2t_meta) {
+        n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size
+        d2t = create_tensor(tn(LLM_TENSOR_EAGLE3_D2T), {n_draft_vocab}, 0);
+        LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
+    } else {
+        d2t = nullptr; // no d2t, use default vocab size
+        LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
+    }
+
+    // Feature fusion layer: projects 3 target layers to draft hidden size
+    fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_target_features, n_embd}, 0);
+
+    // Output layer (uses draft vocab size)
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_draft_vocab}, 0);
+
+    // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own)
+    const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str());
+    if (tok_embd_meta) {
+        const int64_t n_target_vocab = tok_embd_meta->ne[1];
+        tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0);
+        LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab);
+    }
+
+    // Single decoder layer
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        // input_layernorm: applied to token embeddings
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        // Attention takes input_embeds_normed + fused_target_normed as input
+        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0);
+        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0);
+        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
+
+        // eagle3 specific: hidden_norm applied to fused target features
+        layer.eagle3_hidden_norm = create_tensor(tn(LLM_TENSOR_EAGLE3_HIDDEN_NORM, "weight", i), {n_embd}, 0);
+
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+        // rope_freqs for llama3 rope scaling (optional - only if eagle3 config has rope_scaling)
+        layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const {
+    switch (params.gtype) {
+        case LLM_GRAPH_TYPE_ENCODER:
+            return std::make_unique<graph<true>>(*this, params);
+        case LLM_GRAPH_TYPE_DEFAULT:
+        case LLM_GRAPH_TYPE_DECODER:
+            return std::make_unique<graph<false>>(*this, params);
+        default:
+            GGML_ABORT("invalid graph type");
+    };
+}
+
+template <>
+ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
+    const int64_t n_embd_target_features = (int64_t) hparams.n_embd_target_features;
+
+    ggml_tensor * cur = nullptr;
+
+    // Input: Target model features (3 layers concatenated: low, mid, high)
+    // Data will be provided via ubatch->embd in encode_eagle3_features()
+    auto inp_target = std::make_unique<llm_graph_input_embd>(n_embd_target_features);
+    inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_target_features, n_tokens);
+    ggml_set_input(inp_target->embd);
+
+    cur = inp_target->embd;
+    cb(cur, "inp_embd", -1);
+
+    res->add_input(std::move(inp_target));
+
+    return cur;
+}
+
+// eagle3 Encoder: processes target model features through feature fusion layer
+// Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high
+// Output: g_embeddings e.g. [4096, n_tokens] stored in context
+template <>
+llama_model_eagle3::graph<true>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    ggml_tensor * cur = nullptr;
+
+    cur = build_inp_embd_enc();
+
+    // Feature fusion layer
+    cur = build_lora_mm(model.fc, cur);
+    cb(cur, "fc_out", -1);
+
+    // Output: g_embeddings e.g. [4096, n_tokens]
+    // store in t_h_pre_norm (same as MTP) so can be read via llama_get_embeddings_pre_norm(ctx_dft)
+    ggml_set_output(cur);
+    res->t_h_pre_norm = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
+
+// eagle3 Decoder: processes draft tokens using g_embeddings from encoder
+// Input: draft tokens + g_embeddings from encoder
+// Output: draft logits (t_logits) and the pre-norm hidden state (t_h_pre_norm)
+template <>
+llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_layer == 1);  // eagle3 has only one decoder layer
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    // eagle3 Decoder receives:
+    // 1. Token embeddings (e.g. from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B)
+    // 2. g_embeddings from encoder
+    GGML_ASSERT(model.tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
+
+    auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
+
+    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_input(inp->tokens);
+
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+    ggml_set_input(inp->embd);
+
+    ggml_tensor * inp_embd = ggml_get_rows(ctx0, model.tok_embd, inp->tokens);
+    cb(inp_embd, "inp_embd", -1);
+
+    ggml_tensor * inp_g = inp->embd;
+    cb(inp_g, "inp_g_embeddings", -1);
+
+    res->add_input(std::move(inp));
+
+    inpL = inp_g;
+
+    // inp_pos - contains the positions
+    ggml_tensor * inp_pos = build_inp_pos();
+
+    auto * inp_attn = build_attn_inp_kv();
+
+    const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    // Single decoder layer (il = 0)
+    const int il = 0;
+    {
+        // Apply input_layernorm to the token embeddings
+        ggml_tensor * embd_norm = build_norm(inp_embd,
+                model.layers[il].attn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(embd_norm, "embd_norm", il);
+
+        // Apply hidden_norm to inp_g
+        ggml_tensor * g_norm = build_norm(inp_g,
+                model.layers[il].eagle3_hidden_norm, NULL,
+                LLM_NORM_RMS, -1);
+        cb(g_norm, "g_norm", il);
+
+        // norm_before_residual: determines what goes into the residual connection (compatible with Red Hat eagle3 speculator model)
+        // - false (default): use raw inp_g for residual
+        // - true: use normalized g_norm for residual
+        // inpL still holds the raw inp_g here (the concatenated attention input is built below)
+        ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? g_norm : inpL;
+
+        // Concatenate normalized inp_embd and normalized inp_g along dim 0 (the argument is a dimension, not a layer index)
+        cur = ggml_concat(ctx0, embd_norm, g_norm, 0);
+        cb(cur, "concat_embd", il);
+
+        // Self-attention with concatenated input
+        ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+        cb(Qcur, "Qcur", il);
+
+        ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+        cb(Kcur, "Kcur", il);
+
+        ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+        cb(Vcur, "Vcur", il);
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+        // rope freq factors, returns nullptr if not available
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+
+        // RoPE
+        Qcur = ggml_rope_ext(
+                ctx0, Qcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+                );
+        Kcur = ggml_rope_ext(
+                ctx0, Kcur, inp_pos, rope_factors,
+                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                ext_factor, attn_factor, beta_fast, beta_slow
+                );
+
+        cb(Qcur, "Qcur_rope", il);
+        cb(Kcur, "Kcur_rope", il);
+
+        cur = build_attn(inp_attn,
+                model.layers[il].wo, NULL, nullptr,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+
+        if (inp_out_ids) {
+            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        // Add residual and update it
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // Apply FFN norm to the sum
+        cur = build_norm(ffn_inp,
+                model.layers[il].ffn_norm, NULL,
+                LLM_NORM_RMS, il);
+        cb(cur, "post_attn_norm", il);
+
+        cur = build_ffn(cur,
+                model.layers[il].ffn_up,   NULL, NULL,
+                model.layers[il].ffn_gate, NULL, NULL,
+                model.layers[il].ffn_down, NULL, NULL,
+                NULL,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        // Output norm with residual
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "eagle3_prenorm", il);
+
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // Output prenorm state (for next token's g_embeddings in autoregressive generation)
+    ggml_set_output(cur);
+    res->t_h_pre_norm = cur;
+
+    cur = build_norm(cur,
+            model.output_norm, NULL,
+            LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+
+    // lm_head - projects to draft vocabulary
+    cur = build_lora_mm(model.output, cur);
+
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}
diff --git a/src/models/models.h b/src/models/models.h
index c137e32e8fd1..bcaee24377f5 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -1089,6 +1089,21 @@ struct llama_model_glm_dsa : public llama_model_base {
     std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
 
+struct llama_model_eagle3 : public llama_model_base {
+    llama_model_eagle3(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    template <bool is_enc>
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+
+        ggml_tensor * build_inp_embd_enc() const;
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
 
 struct llama_model_mistral4 : public llama_model_deepseek2 {
     llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {}
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 6fa302e132f3..0ebb90aba2f5 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -939,6 +939,9 @@ struct server_context_impl {
                 return false;
             }
 
+            // eagle3/DFlash: shares target model's token_embd
+            common_speculative_setup_draft_model(model_dft.get(), model_tgt);
+
             auto cparams = common_context_params_to_llama(params_dft);
 
             const bool spec_mtp = std::find(params_base.speculative.types.begin(),

From 16e65554fb0887f15b72bb459f4d252081002055 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Mon, 18 May 2026 16:02:29 +0000
Subject: [PATCH 03/27] eagle3: fix params bug

---
 common/speculative.cpp | 8 ++++----
 src/llama-context.h    | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 79202842023e..1d373ccd4fbc 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -404,8 +404,8 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl {
 //
 // Performance is overall good but there is waste in verify cycle:
 //   process() runs encoder + decoder on the *full* verify batch including rows for
-//   rejected drafts. The KV at those positions is then dropped. 
-// 
+//   rejected drafts. The KV at those positions is then dropped.
+//
 // TODO: Not sure if we need optimization for this waste?
 // If so we may need hybrid stash:
 //      in verify mode, have process() only stash features and let draft() seed run
@@ -486,8 +486,8 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         }
 
         // turn on extraction of the draft model's pre-norm hidden state
-        // (used both for the encoder output g_embd and the decoder pre-norm output)
-        llama_set_embeddings_pre_norm(ctx_dft, true);
+        // (used both for the encoder output g_embd and the decoder pre-norm output).
+        llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
 
         pending_g_last.assign(n_seq, std::vector<float>(n_embd_dec, 0.0f));
         pending_pos_last.assign(n_seq, -1);
diff --git a/src/llama-context.h b/src/llama-context.h
index d6d483cb97d6..7d7828319693 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -231,7 +231,7 @@ struct llama_context {
     // map the output row index `i` to batch index
     int64_t output_resolve_row(int32_t i) const;
 
-    // async-copy enabled layer-input tensors (per cparams.output_layer_inp) 
+    // async-copy enabled layer-input tensors (per cparams.output_layer_inp)
     // from backend into host-side embd_layer_inp buffers
     void extract_layer_inputs(const llm_graph_result * res);
 

From 752bf2331f01d05d09632dc06ce7b4fa59ec6380 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Wed, 20 May 2026 16:23:42 +0000
Subject: [PATCH 04/27] eagle3: support Gemma4 eagle3 from RedHatAI

---
 conversion/__init__.py | 3 +++
 conversion/llama.py    | 3 +++
 src/models/gemma4.cpp  | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/conversion/__init__.py b/conversion/__init__.py
index 18162976f458..cd6f8e6b937c 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -130,6 +130,9 @@
     "LlamaBidirectionalModel": "llama",
     "LlamaForCausalLM": "llama",
     "LlamaModel": "llama",
+    "Eagle3DraftModel": "llama",
+    "Eagle3Speculator": "llama",
+    "LlamaForCausalLMEagle3": "llama",
     "LlavaForConditionalGeneration": "llama",
     "LlavaStableLMEpochForCausalLM": "stablelm",
     "MPTForCausalLM": "mpt",
diff --git a/conversion/llama.py b/conversion/llama.py
index db073b9b361a..b08388e456bd 100644
--- a/conversion/llama.py
+++ b/conversion/llama.py
@@ -63,6 +63,9 @@ def __init__(self, *args, **kwargs):
             with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f:
                 target_config = json.load(f)
 
+            if "text_config" in target_config:
+                target_config = {**target_config, **target_config["text_config"]}
+
             # extract_layers: derived from target model layer count (low/mid/high)
             target_num_layers = target_config["num_hidden_layers"]
             extract_layers = [2, target_num_layers // 2, target_num_layers - 3]
diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp
index 6f7fcd645cbd..d0cc40fab2c8 100644
--- a/src/models/gemma4.cpp
+++ b/src/models/gemma4.cpp
@@ -210,6 +210,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
         const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
         const int   n_rot_l      = hparams.n_rot(il);
 
+        res->t_layer_inp[il] = inpL;
+        
         // norm
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
         cb(cur, "attn_norm", il);

From b32d9ebe970d496bbfe38af880dbf6ed76919c07 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Wed, 27 May 2026 13:52:08 +0000
Subject: [PATCH 05/27] eagle3: set sync when get features from target

Co-authored-by: tnhnyzc <115956684+tnhnyzc@users.noreply.github.com>
---
 src/llama-context.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4c40bdf3703d..c9956d12bd4a 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -4107,5 +4107,7 @@ void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, b
 }
 
 float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id) {
+    ctx->synchronize();
+
     return ctx->get_output_layer_inp(layer_id);
 }

From 7c5f428dc8d902b409c4c34e2fea91d960666a66 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Wed, 27 May 2026 16:38:27 +0000
Subject: [PATCH 06/27] eagle3 : fix ubatch handling in embd_layer_inp
 extraction and encoder
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Doğaç Eldenk <dogacel@gmail.com>
---
 common/speculative.cpp | 51 +++++++++++++++++++++++++++---------------
 src/llama-context.cpp  | 45 ++++++++++++++++++++++++++++++-------
 src/llama-context.h    |  4 ++--
 src/llama-ext.h        |  2 +-
 4 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 1d373ccd4fbc..0c49f0ee372a 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -433,6 +433,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
 
     // scratch buffer for concatenated target features [n_tokens, n_embd_enc]
     std::vector<float> features_buf;
+    std::vector<float> g_embd_buf;
 
     common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq)
         : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq)
@@ -570,25 +571,39 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
             }
         }
 
-        llama_batch enc_batch = {
-            /*.n_tokens =*/ n_tokens,
-            /*.token    =*/ nullptr,
-            /*.embd     =*/ features_buf.data(),
-            /*.pos      =*/ nullptr,
-            /*.n_seq_id =*/ nullptr,
-            /*.seq_id   =*/ nullptr,
-            /*.logits   =*/ nullptr,
-        };
-        int rc = llama_encode(ctx_dft, enc_batch);
-        if (rc != 0) {
-            LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d)\n",
-                    __func__, rc, (int) n_tokens);
-            return false;
+        g_embd_buf.resize((size_t) n_tokens * n_embd_dec);
+
+        // llama_encode() requires the full encoder batch to fit in n_ubatch.
+        // Allow batch > ubatch: eagle3's per-token encoder can be chunked safely.
+        const int32_t n_ubatch_dft = (int32_t) llama_n_ubatch(ctx_dft);
+        for (int32_t i = 0; i < n_tokens; i += n_ubatch_dft) {
+            const int32_t n_chunk = std::min(n_ubatch_dft, n_tokens - i);
+
+            llama_batch enc_batch = {
+                /*.n_tokens =*/ n_chunk,
+                /*.token    =*/ nullptr,
+                /*.embd     =*/ features_buf.data() + (size_t) i * n_embd_enc,
+                /*.pos      =*/ nullptr,
+                /*.n_seq_id =*/ nullptr,
+                /*.seq_id   =*/ nullptr,
+                /*.logits   =*/ nullptr,
+            };
+            const int32_t rc = llama_encode(ctx_dft, enc_batch);
+            if (rc != 0) {
+                LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n",
+                        __func__, rc, (int) n_chunk, (int) i);
+                return false;
+            }
+
+            // g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer.
+            const float * g_embd_chunk = llama_get_embeddings_pre_norm(ctx_dft);
+            GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output.");
+            std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec,
+                        g_embd_chunk,
+                        (size_t) n_chunk * n_embd_dec * sizeof(float));
         }
 
-        // g_embd has shape [n_tokens, n_embd_dec] in ctx_dft's pre-norm embeddings buffer
-        const float * g_embd = llama_get_embeddings_pre_norm(ctx_dft);
-        GGML_ASSERT(g_embd && "EAGLE3 encoder produced no output.");
+        const float * g_embd = g_embd_buf.data();
 
         const size_t row_bytes = (size_t) n_embd_dec * sizeof(float);
 
@@ -649,7 +664,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         }
 
         if (batch.n_tokens > 0) {
-            rc = llama_decode(ctx_dft, batch);
+            const int32_t rc = llama_decode(ctx_dft, batch);
             if (rc != 0) {
                 LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n",
                         __func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index c9956d12bd4a..28b3d3880f48 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1279,10 +1279,10 @@ void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) {
 }
 
 float * llama_context::get_output_layer_inp(uint32_t layer_id) {
-    if (layer_id >= embd_layer_inp.size() || embd_layer_inp[layer_id].empty()) {
+    if (layer_id >= embd_layer_inp.size() || !embd_layer_inp[layer_id].has_data()) {
         return nullptr;
     }
-    return embd_layer_inp[layer_id].data();
+    return embd_layer_inp[layer_id].data;
 }
 
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
@@ -1979,7 +1979,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
             }
         }
 
-        extract_layer_inputs(res);
+        extract_layer_inputs(res, n_tokens_prev, ubatch.n_tokens);
 
         // extract nextn embeddings before
         // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored.
@@ -2099,6 +2099,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 
     size_t backend_float_count = 0;
     size_t backend_token_count = 0;
+    size_t embd_layer_inp_float_count = 0;
 
     logits.size     = has_logits     ? n_vocab*n_outputs_max     : 0;
     embd.size       = has_embd       ? n_embd_out*n_outputs_max  : 0;
@@ -2110,6 +2111,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
         embd_nextn.size = (size_t) n_embd_out * n_batch;
     }
 
+    for (bool enabled : cparams.output_layer_inp) {
+        if (enabled) {
+            embd_layer_inp_float_count += (size_t) n_embd * n_batch;
+        }
+    }
+
     // Allocate backend sampling output buffers if there are backend samplers configured.
     const bool has_sampling = !sampling.samplers.empty();
     if (has_sampling) {
@@ -2124,8 +2131,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 
     const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
     const size_t new_size  =
-        (logits.size + embd.size + embd_nextn.size + backend_float_count) * sizeof(float) +
-        (                                               backend_token_count) * sizeof(llama_token);
+        (logits.size + embd.size + embd_nextn.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) +
+        (                                                                                  backend_token_count) * sizeof(llama_token);
 
     // alloc only when more than the current capacity is required
     // TODO: also consider shrinking the buffer
@@ -2142,6 +2149,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
             logits.data = nullptr;
             embd.data = nullptr;
             embd_nextn.data = nullptr;
+            for (auto & layer_inp : embd_layer_inp) {
+                layer_inp = {nullptr, 0};
+            }
         }
 
         auto * buft = ggml_backend_cpu_buffer_type();
@@ -2173,6 +2183,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     embd_nextn = has_embd_nextn ? buffer_view<float>{(float *) (base + offset), embd_nextn.size} : buffer_view<float>{nullptr, 0};
     offset += embd_nextn.size * sizeof(float);
 
+    for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) {
+        if (cparams.output_layer_inp[il]) {
+            embd_layer_inp[il] = buffer_view<float>{(float *) (base + offset), (size_t) n_embd * n_batch};
+            offset += embd_layer_inp[il].size * sizeof(float);
+        } else {
+            embd_layer_inp[il] = buffer_view<float>{nullptr, 0};
+        }
+    }
+
     if (has_sampling) {
         sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
         offset += sampling.logits.size * sizeof(float);
@@ -2219,20 +2238,30 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     return n_outputs_max;
 }
 
-void llama_context::extract_layer_inputs(const llm_graph_result * res) {
+void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) {
     for (uint32_t il = 0; il < cparams.output_layer_inp.size(); ++il) {
         if (!cparams.output_layer_inp[il]) {
             continue;
         }
+        if (!embd_layer_inp[il].has_data()) {
+            continue;
+        }
         ggml_tensor * t = res->get_layer_inp((int) il);
         if (!t) {
             continue;
         }
         const size_t nbytes = ggml_nbytes(t);
-        embd_layer_inp[il].resize(nbytes / sizeof(float));
+        const size_t nfloats = nbytes / sizeof(float);
+        GGML_ASSERT(n_tokens > 0);
+        GGML_ASSERT(nfloats % n_tokens == 0);
+
+        const size_t row_floats = nfloats / n_tokens;
+        const size_t dst_offset = token_offset * row_floats;
+        GGML_ASSERT(dst_offset + nfloats <= embd_layer_inp[il].size);
+
         ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t);
         GGML_ASSERT(backend != nullptr);
-        ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data(), 0, nbytes);
+        ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data + dst_offset, 0, nbytes);
     }
 }
 
diff --git a/src/llama-context.h b/src/llama-context.h
index 7d7828319693..af809d280386 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -233,7 +233,7 @@ struct llama_context {
 
     // async-copy enabled layer-input tensors (per cparams.output_layer_inp)
     // from backend into host-side embd_layer_inp buffers
-    void extract_layer_inputs(const llm_graph_result * res);
+    void extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens);
 
     //
     // graph
@@ -365,7 +365,7 @@ struct llama_context {
 
     // host buffer for output layer input embeddings, per layer
     // populated when cparams.output_layer_inp[il] is true
-    std::vector<std::vector<float>> embd_layer_inp;
+    std::vector<buffer_view<float>> embd_layer_inp;
 
     // keep copies of the per-sequence memory on the device
     std::map<llama_seq_id, llama_memory_buffers> mem_storage;
diff --git a/src/llama-ext.h b/src/llama-ext.h
index 51838a761f10..fdde3c89a01e 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -110,7 +110,7 @@ LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
 // set if the layer input embeddings should be outputed
 LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable);
 
-// read back the input embeddings of the specified layer for the most recent ubatch
+// read back the input embeddings of the specified layer for the most recent decode batch
 // the layer must have been enabled via llama_set_output_layer_inp
 LLAMA_API float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id);
 

From 91b9cfc74220558bff741a59b24adb62d66cb977 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Fri, 5 Jun 2026 12:55:05 +0000
Subject: [PATCH 07/27] eagle3: adapt to upstream changes

---
 common/speculative.cpp | 8 ++++----
 src/llama-context.cpp  | 5 +++--
 src/llama-model.cpp    | 5 +++--
 src/models/eagle3.cpp  | 6 +++---
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 0c49f0ee372a..0ac0d7ffcaef 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -488,7 +488,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
 
         // turn on extraction of the draft model's pre-norm hidden state
         // (used both for the encoder output g_embd and the decoder pre-norm output).
-        llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true);
+        llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true);
 
         pending_g_last.assign(n_seq, std::vector<float>(n_embd_dec, 0.0f));
         pending_pos_last.assign(n_seq, -1);
@@ -596,7 +596,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
             }
 
             // g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer.
-            const float * g_embd_chunk = llama_get_embeddings_pre_norm(ctx_dft);
+            const float * g_embd_chunk = llama_get_embeddings_nextn(ctx_dft);
             GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output.");
             std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec,
                         g_embd_chunk,
@@ -738,7 +738,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
 
                 common_sampler_sample(smpl, ctx_dft, i_batch, true);
                 // pre-norm hidden state of this position becomes g_embd for the next step
-                const float * prenorm = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch);
+                const float * prenorm = llama_get_embeddings_nextn_ith(ctx_dft, i_batch);
                 ++i_batch;
 
                 const auto * cur_p = common_sampler_get_candidates(smpl, true);
@@ -1778,7 +1778,7 @@ common_speculative * common_speculative_init(common_params_speculative & params,
         uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types);
 
         bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE));
-        bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && has_draft_model_path;
+        bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr;
         bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr;
 
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 28b3d3880f48..1b3c073c05d5 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -86,6 +86,7 @@ llama_context::llama_context(
     cparams.cb_eval_user_data = params.cb_eval_user_data;
 
     cparams.ctx_other = nullptr;
+
     cparams.output_layer_inp.resize(hparams.n_layer, false);
     embd_layer_inp.resize(hparams.n_layer);
 
@@ -196,7 +197,7 @@ llama_context::llama_context(
 
     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
 
-    cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max;
+    cparams.n_outputs_max = params.n_outputs_max == 0 || llama_model_has_encoder(&model) ? cparams.n_batch : params.n_outputs_max;
 
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
@@ -1271,7 +1272,7 @@ bool llama_context::set_adapter_cvec(
 void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) {
     LLAMA_LOG_DEBUG("%s: layer_id = %d, enable = %d\n", __func__, layer_id, enable);
 
-    GGML_ASSERT(layer_id < model.hparams.n_layer);
+    GGML_ASSERT(layer_id < model.hparams.n_layer_all);
 
     cparams.output_layer_inp[layer_id] = enable;
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index a41740f81ce2..ebecd57f550a 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2603,8 +2603,9 @@ uint64_t llama_model_n_params(const llama_model * model) {
 
 bool llama_model_has_encoder(const llama_model * model) {
     switch (model->arch) {
-        case LLM_ARCH_T5:        return true;
-        case LLM_ARCH_T5ENCODER: return true;
+        case LLM_ARCH_T5:
+        case LLM_ARCH_T5ENCODER:
+        case LLM_ARCH_EAGLE3:    return true;
         default:                 return false;
     }
 }
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index 3694d262cb85..5c5769be53dd 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -136,9 +136,9 @@ llama_model_eagle3::graph<true>::graph(const llama_model & model, const llm_grap
     cb(cur, "fc_out", -1);
 
     // Output: g_embeddings e.g. [4096, n_tokens]
-    // store in t_h_pre_norm (same as MTP) so can be read via llama_get_embeddings_pre_norm(ctx_dft)
+    // store in t_h_nextn (same as MTP) so can be read via llama_get_embeddings_nextn(ctx_dft)
     ggml_set_output(cur);
-    res->t_h_pre_norm = cur;
+    res->t_h_nextn = cur;
 
     ggml_build_forward_expand(gf, cur);
 }
@@ -283,7 +283,7 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
 
     // Output prenorm state (for next token's g_embeddings in autoregressive generation)
     ggml_set_output(cur);
-    res->t_h_pre_norm = cur;
+    res->t_h_nextn = cur;
 
     cur = build_norm(cur,
             model.output_norm, NULL,

From 4ca8087b7b80c6486bc17513ff1434e42e77da64 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Mon, 8 Jun 2026 12:58:53 +0000
Subject: [PATCH 08/27] eagle3: fix rebase issues and adapt to upstream changes

---
 src/llama-arch.h      | 2 +-
 src/llama-context.cpp | 5 +++--
 src/llama-hparams.h   | 1 -
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llama-arch.h b/src/llama-arch.h
index 60581af024da..0474d0e6659b 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -141,8 +141,8 @@ enum llm_arch {
     LLM_ARCH_KIMI_LINEAR,
     LLM_ARCH_TALKIE,
     LLM_ARCH_MELLUM,
-    LLM_ARCH_UNKNOWN,
     LLM_ARCH_EAGLE3,
+    LLM_ARCH_UNKNOWN,
 };
 
 enum llm_kv {
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 1b3c073c05d5..f1296b7d4882 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -87,8 +87,8 @@ llama_context::llama_context(
 
     cparams.ctx_other = nullptr;
 
-    cparams.output_layer_inp.resize(hparams.n_layer, false);
-    embd_layer_inp.resize(hparams.n_layer);
+    cparams.output_layer_inp.resize(hparams.n_layer_all, false);
+    embd_layer_inp.resize(hparams.n_layer_all);
 
     // TODO: more generic
     if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
@@ -2086,6 +2086,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 
     const auto n_batch    = cparams.n_batch;
     const auto n_vocab    = vocab.n_tokens();
+    const auto n_embd     = hparams.n_embd;
     const auto n_embd_out = hparams.n_embd_out();
 
     bool has_logits     = true;
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 62d91129504d..970a8d689e05 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -48,7 +48,6 @@ struct llama_hparams {
 
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
-    uint32_t n_embd_inp_impl = 0;
     uint32_t n_layer_all;
     uint32_t n_layer_nextn = 0;
     uint32_t n_expert = 0;

From 413c16da6693a77e40b4b0e5c9308d74956a2909 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Mon, 8 Jun 2026 13:15:21 +0000
Subject: [PATCH 09/27] eagle3: exclude the eagle3 arch from test-llama-archs

---
 tests/test-llama-archs.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp
index 8037a11398b0..4d06274ef1eb 100644
--- a/tests/test-llama-archs.cpp
+++ b/tests/test-llama-archs.cpp
@@ -450,6 +450,9 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
         if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
             continue; // FIXME: ISWA KV cache initialization needs more fixture params
         }
+        if (arch == LLM_ARCH_EAGLE3) {
+            continue;
+        }
         for (bool moe : {false, true}) {
             if (moe && !moe_implemented(arch)) {
                 continue;
@@ -553,6 +556,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
         if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) {
             continue; // FIXME: ISWA KV cache initialization needs more fixture params
         }
+        if (arch == LLM_ARCH_EAGLE3) {
+            continue;
+        }
 
         const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1;
         for (bool moe : {false, true}) {

From 6c212225ce132cc57b5b8f53d1cad0de20442f55 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Mon, 8 Jun 2026 13:26:41 +0000
Subject: [PATCH 10/27] eagle3: fix editorconfig check failures

---
 src/llama-ext.h       | 1 -
 src/models/gemma4.cpp | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/llama-ext.h b/src/llama-ext.h
index fdde3c89a01e..163e1a674284 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -130,4 +130,3 @@ LLAMA_API const int32_t * llama_model_target_extract_layers  (const struct llama
 LLAMA_API uint32_t        llama_model_n_target_extract_layers(const struct llama_model * model);
 // returns the target model hidden size
 LLAMA_API uint32_t        llama_model_target_hidden_size     (const struct llama_model * model);
-
diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp
index d0cc40fab2c8..6a96979cebde 100644
--- a/src/models/gemma4.cpp
+++ b/src/models/gemma4.cpp
@@ -211,7 +211,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para
         const int   n_rot_l      = hparams.n_rot(il);
 
         res->t_layer_inp[il] = inpL;
-        
+
         // norm
         cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il);
         cb(cur, "attn_norm", il);

From ac7e2b2f4fc05905e8581c241b9bea07dc1f44a4 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Tue, 9 Jun 2026 17:03:11 +0000
Subject: [PATCH 11/27] eagle3: fix multi-seq issue in d2t vocab mapping

---
 src/llama-context.cpp | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index f1296b7d4882..cc69ae8a1594 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1891,7 +1891,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
                     static thread_local std::vector<float>   eagle3_draft_logits;
 
                     const int64_t draft_vocab_size = t_logits->ne[0];
-                    const uint32_t last_idx = n_outputs - 1;
 
                     if (eagle3_d2t_map.empty()) {
                         eagle3_d2t_map.resize(model.d2t->ne[0]);
@@ -1899,20 +1898,24 @@ int llama_context::decode(const llama_batch & batch_inp) {
                                                 eagle3_d2t_map.size() * sizeof(int64_t));
                     }
 
-                    eagle3_draft_logits.resize(draft_vocab_size);
-                    const size_t last_offset = last_idx * draft_vocab_size * sizeof(float);
+                    // remap every output row (one per sequence) from draft vocab to target vocab.
+                    eagle3_draft_logits.resize((size_t) n_outputs * draft_vocab_size);
                     ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(),
-                                                  last_offset, draft_vocab_size * sizeof(float));
+                                                  0, (size_t) n_outputs * draft_vocab_size * sizeof(float));
                     synchronize();
 
-                    float * last_logits_out = logits_out + last_idx * n_vocab;
-                    std::fill(last_logits_out, last_logits_out + n_vocab,
-                              -std::numeric_limits<float>::infinity());
+                    for (uint32_t r = 0; r < n_outputs; r++) {
+                        float       * row_out = logits_out + (size_t) r * n_vocab;
+                        const float * row_in  = eagle3_draft_logits.data() + (size_t) r * draft_vocab_size;
 
-                    for (int64_t j = 0; j < draft_vocab_size; j++) {
-                        const int64_t target_id = j + eagle3_d2t_map[j];
-                        GGML_ASSERT(target_id >= 0 && target_id < n_vocab);
-                        last_logits_out[target_id] = eagle3_draft_logits[j];
+                        std::fill(row_out, row_out + n_vocab,
+                                  -std::numeric_limits<float>::infinity());
+
+                        for (int64_t j = 0; j < draft_vocab_size; j++) {
+                            const int64_t target_id = j + eagle3_d2t_map[j];
+                            GGML_ASSERT(target_id >= 0 && target_id < n_vocab);
+                            row_out[target_id] = row_in[j];
+                        }
                     }
                 } else {
                     ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));

From 544aaa2faa9c2fb8bf1705629d82df97816fe28e Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 10 Jun 2026 10:07:31 +0300
Subject: [PATCH 12/27] cont : minor style / clean-up

---
 common/speculative.cpp | 40 +++++++++++++++++++++-------------------
 src/llama-arch.h       |  2 +-
 src/llama-context.cpp  |  3 +--
 src/models/eagle3.cpp  |  9 ++++-----
 src/models/qwen35.cpp  |  2 +-
 5 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 0ac0d7ffcaef..31935dc688b0 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -556,13 +556,12 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         // Interleave each extract_layer's hidden state into a contiguous buffer of
         // shape [n_tokens, n_extract_layers * tgt_hidden]. Then run EAGLE3 encoder
         // to get one g_embd row per token.
-        features_buf.assign((size_t) n_tokens * n_embd_enc, 0.0f);
+        features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f);
 
         for (uint32_t k = 0; k < n_extract_layers; ++k) {
             const float * layer = llama_get_output_layer_inp(ctx_tgt, (uint32_t) extract_layers[k]);
             if (!layer) {
-                GGML_ABORT("EAGLE3: target layer %d input not extracted.",
-                           extract_layers[k]);
+                GGML_ABORT("EAGLE3: target layer %d input not extracted.", extract_layers[k]);
             }
             for (int32_t i = 0; i < n_tokens; ++i) {
                 float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) tgt_hidden;
@@ -631,7 +630,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
             // Fires iff all three preconditions hold:
             //   1) pending_pos_last >= 0
             //   2) pending_pos_last + 1 == pos[beg]
-            //   3) pending_pos_last > dft_pos_max
+            //   3) pending_pos_last > dft_pos_max // TODO: is this check needed?
             const llama_pos pending_pos = pending_pos_last[seq_id];
             if (pending_pos >= 0 && pending_pos + 1 == batch_in.pos[beg]) {
                 const llama_pos dft_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id);
@@ -643,8 +642,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
             }
 
             for (int32_t k = beg; k < end; ++k) {
-                common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k],
-                                 { seq_id }, /*logits=*/ false);
+                common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k], { seq_id }, /*logits=*/ false);
                 std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
                             g_embd + (size_t) k * n_embd_dec, row_bytes);
             }
@@ -652,15 +650,11 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
             // refresh deferred state
             const int32_t n_rows = end - beg + 1;
             verify_pos_first[seq_id] = batch_in.pos[beg];
-            verify_g_rows[seq_id]    = n_rows;
-            verify_g[seq_id].assign((size_t) n_rows * n_embd_dec, 0.0f);
-            std::memcpy(verify_g[seq_id].data(),
-                        g_embd + (size_t) beg * n_embd_dec,
-                        (size_t) n_rows * row_bytes);
-
-            std::memcpy(pending_g_last[seq_id].data(),
-                        g_embd + (size_t) end * n_embd_dec, row_bytes);
             pending_pos_last[seq_id] = batch_in.pos[end];
+            verify_g_rows[seq_id]    = n_rows;
+            verify_g[seq_id].resize((size_t) n_rows * n_embd_dec, 0.0f);
+            std::memcpy(verify_g[seq_id].data(),       g_embd + (size_t) beg * n_embd_dec, row_bytes * n_rows);
+            std::memcpy(pending_g_last[seq_id].data(), g_embd + (size_t) end * n_embd_dec, row_bytes);
         }
 
         if (batch.n_tokens > 0) {
@@ -767,17 +761,14 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
 
                 result.push_back(id);
 
-                if ((params.n_max <= (int) result.size()) ||
-                    (dp.n_max > 0 && dp.n_max <= (int) result.size())) {
+                if (params.n_max <= (int) result.size()) {
                     drafting[seq_id] = false;
                     n_drafting--;
                     continue;
                 }
 
                 common_batch_add(batch, id, pending_pos_last[seq_id] + (i + 1), { seq_id }, true);
-                std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec,
-                            prenorm,
-                            row_bytes);
+                std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, prenorm, row_bytes);
             }
 
             if (batch.n_tokens == 0) {
@@ -792,6 +783,17 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
 
             ++i;
         }
+
+        for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) {
+            auto & dp = dparams[seq_id];
+            if (!dp.drafting) {
+                continue;
+            }
+
+            if (dp.result->size() < (size_t) params.n_min) {
+                dp.result->clear();
+            }
+        }
     }
 
     void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override {
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 0474d0e6659b..6a4f3ec6b841 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -572,7 +572,7 @@ enum llm_tensor {
     LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
     LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
     LLM_TENSOR_EAGLE3_FC,
-    LLM_TENSOR_EAGLE3_HIDDEN_NORM,
+    LLM_TENSOR_EAGLE3_HIDDEN_NORM, // TODO: remove, use LLM_TENSOR_ATTN_NORM instead
     LLM_TENSOR_EAGLE3_D2T,
 };
 
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index cc69ae8a1594..2ff84e5b4cef 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1908,8 +1908,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
                         float       * row_out = logits_out + (size_t) r * n_vocab;
                         const float * row_in  = eagle3_draft_logits.data() + (size_t) r * draft_vocab_size;
 
-                        std::fill(row_out, row_out + n_vocab,
-                                  -std::numeric_limits<float>::infinity());
+                        std::fill(row_out, row_out + n_vocab, -std::numeric_limits<float>::infinity());
 
                         for (int64_t j = 0; j < draft_vocab_size; j++) {
                             const int64_t target_id = j + eagle3_d2t_map[j];
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index 5c5769be53dd..7d2121f6d8a4 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -249,11 +249,6 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
                 model.layers[il].wo, NULL, nullptr,
                 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
 
-        if (inp_out_ids) {
-            cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
-
         // Add residual and update it
         ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
         cb(ffn_inp, "ffn_inp", il);
@@ -285,6 +280,10 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
     ggml_set_output(cur);
     res->t_h_nextn = cur;
 
+    if (inp_out_ids) {
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    }
+
     cur = build_norm(cur,
             model.output_norm, NULL,
             LLM_NORM_RMS, -1);
diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp
index 4b642cff467c..6783d98ec204 100644
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@@ -173,7 +173,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para
         }
 
         if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) {
-            cur   = ggml_get_rows(ctx0, cur, inp_out_ids);
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
 

From b9f41d181dee4ff3599bd599b373c41bc1eb32d3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 10 Jun 2026 10:28:42 +0300
Subject: [PATCH 13/27] spec : remove `common_speculative_setup_draft_model()`

---
 common/speculative.cpp          | 20 --------------------
 common/speculative.h            |  4 ----
 src/llama-context.cpp           | 11 ++++++++++-
 src/models/eagle3.cpp           | 11 +++++++++--
 tools/server/server-context.cpp |  3 ---
 5 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 31935dc688b0..6af8dc2a9d0b 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -1912,26 +1912,6 @@ void common_speculative_free(common_speculative * spec) {
     delete spec;
 }
 
-void common_speculative_setup_draft_model(struct llama_model * model_dft, const struct llama_model * model_tgt) {
-    if (model_dft == nullptr || model_tgt == nullptr) {
-        return;
-    }
-    if (llama_model_get_tok_embd(model_dft) == nullptr) {
-        ggml_tensor * tgt_tok_embd = llama_model_get_tok_embd(model_tgt);
-        if (tgt_tok_embd != nullptr) {
-            llama_model_set_tok_embd(model_dft, tgt_tok_embd);
-            LOG_INF("%s: draft inheriting target's tok_embd\n", __func__);
-        }
-    }
-    if (llama_model_get_lm_head(model_dft) == nullptr) {
-        ggml_tensor * tgt_lm_head = llama_model_get_lm_head(model_tgt);
-        if (tgt_lm_head != nullptr) {
-            llama_model_set_lm_head(model_dft, tgt_lm_head);
-            LOG_INF("%s: draft inheriting target's lm_head\n", __func__);
-        }
-    }
-}
-
 common_speculative_draft_params & common_speculative_get_draft_params(
         common_speculative * spec,
         llama_seq_id seq_id) {
diff --git a/common/speculative.h b/common/speculative.h
index f1cfcb237f4c..bf76ad709e26 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -27,10 +27,6 @@ common_speculative * common_speculative_init(common_params_speculative & params,
 
 void common_speculative_free(common_speculative * spec);
 
-// Optional setup hook to call once after loading the draft model but before creating its context.
-// Inherits any missing weights from the target model (e.g. tok_embd / lm_head from target model for eagle3 / dflash)
-void common_speculative_setup_draft_model(struct llama_model * model_dft, const struct llama_model * model_tgt);
-
 struct common_speculative_draft_params {
     // this flag is used to chain the drafts through all the available implementations
     // after the first successful draft from an implementation, we set it
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 2ff84e5b4cef..b8e641345ccc 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -94,12 +94,21 @@ llama_context::llama_context(
     if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
         if (params.ctx_other == nullptr) {
             // TODO: change from runtime_error to llama_exception to avoid printing error message
-            throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)");
+            throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this warning is normal during memory fitting)");
         }
 
         cparams.ctx_other = params.ctx_other;
     }
 
+    if (model.arch == LLM_ARCH_EAGLE3) {
+        if (model.tok_embd == nullptr) {
+            if (params.ctx_other == nullptr) {
+                throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)");
+            }
+            cparams.ctx_other = params.ctx_other;
+        }
+    }
+
     // Initialize backend samplers here so they are part of the sampling graph
     // before the reserve passes run later in this function. This avoids a later
     // re-reserve when graph nodes change.
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index 7d2121f6d8a4..6179e92274c8 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -159,7 +159,14 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
     // eagle3 Decoder receives:
     // 1. Token embeddings (e.g.from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B)
     // 2. g_embeddings from encoder
-    GGML_ASSERT(model.tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
+    auto * tok_embd = model.tok_embd;
+    if (model.tok_embd == nullptr) {
+        GGML_ASSERT(cparams.ctx_other != nullptr);
+        const auto * model_other = llama_get_model(cparams.ctx_other);
+
+        GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)");
+        tok_embd = model_other->tok_embd;
+    }
 
     auto inp = std::make_unique<llm_graph_input_embd>(n_embd);
 
@@ -169,7 +176,7 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
     inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
     ggml_set_input(inp->embd);
 
-    ggml_tensor * inp_embd = ggml_get_rows(ctx0, model.tok_embd, inp->tokens);
+    ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens);
     cb(inp_embd, "inp_embd", -1);
 
     ggml_tensor * inp_g = inp->embd;
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 3c395f683ab9..bdfa51718080 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -935,9 +935,6 @@ struct server_context_impl {
                 return false;
             }
 
-            // eagle3/DFlash: shares target model's token_embd
-            common_speculative_setup_draft_model(model_dft.get(), model_tgt);
-
             auto cparams = common_context_params_to_llama(params_dft);
 
             const bool spec_mtp = std::find(params_base.speculative.types.begin(),

From f3fbbedfcaf8defa3a09fafe0454b56c4a0573b3 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 10 Jun 2026 11:09:48 +0300
Subject: [PATCH 14/27] llama : clean-up unused API

---
 src/llama-context.cpp |  6 +++---
 src/llama-context.h   |  8 ++++----
 src/llama-ext.h       |  6 ------
 src/llama-model.cpp   | 16 ----------------
 4 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index b8e641345ccc..ec7a60f65dcf 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -71,6 +71,9 @@ llama_context::llama_context(
     cparams.no_perf                 = params.no_perf;
     cparams.warmup                  = false;
 
+    cparams.output_layer_inp.resize(hparams.n_layer_all, false);
+    embd_layer_inp.resize(hparams.n_layer_all);
+
     cparams.ctx_type     = params.ctx_type;
     cparams.pooling_type = params.pooling_type;
 
@@ -87,9 +90,6 @@ llama_context::llama_context(
 
     cparams.ctx_other = nullptr;
 
-    cparams.output_layer_inp.resize(hparams.n_layer_all, false);
-    embd_layer_inp.resize(hparams.n_layer_all);
-
     // TODO: more generic
     if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) {
         if (params.ctx_other == nullptr) {
diff --git a/src/llama-context.h b/src/llama-context.h
index af809d280386..d6e321a5049c 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -297,6 +297,10 @@ struct llama_context {
     // sets llm_graph_result::t_h_nextn
     buffer_view<float> embd_nextn = {nullptr, 0};
 
+    // host buffers for output layer input embeddings, per layer
+    // populated when cparams.output_layer_inp[il] is true
+    std::vector<buffer_view<float>> embd_layer_inp;
+
     struct sampling_info {
         // !samplers.empty() to check if any samplers are active
         std::map<llama_seq_id, llama_sampler *> samplers;
@@ -363,10 +367,6 @@ struct llama_context {
     // host buffer for the model output (logits and embeddings)
     ggml_backend_buffer_ptr buf_output;
 
-    // host buffer for output layer input embeddings, per layer
-    // populated when cparams.output_layer_inp[il] is true
-    std::vector<buffer_view<float>> embd_layer_inp;
-
     // keep copies of the per-sequence memory on the device
     std::map<llama_seq_id, llama_memory_buffers> mem_storage;
 
diff --git a/src/llama-ext.h b/src/llama-ext.h
index 163e1a674284..f511f091bb86 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -114,12 +114,6 @@ LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t l
 // the layer must have been enabled via llama_set_output_layer_inp
 LLAMA_API float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id);
 
-LLAMA_API ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model);
-LLAMA_API void          llama_model_set_tok_embd(      struct llama_model * model, ggml_tensor * tensor);
-
-LLAMA_API ggml_tensor * llama_model_get_lm_head(const struct llama_model * model);
-LLAMA_API void          llama_model_set_lm_head(      struct llama_model * model, ggml_tensor * tensor);
-
 //
 // eagle3/DFlash: consume target model extracted features
 //
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ebecd57f550a..5a2b7aaec451 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2692,22 +2692,6 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
     }
 }
 
-ggml_tensor * llama_model_get_tok_embd(const struct llama_model * model) {
-    return model->tok_embd;
-}
-
-void llama_model_set_tok_embd(struct llama_model * model, ggml_tensor * tensor) {
-    model->tok_embd = tensor;
-}
-
-ggml_tensor * llama_model_get_lm_head(const struct llama_model * model) {
-    return model->output;
-}
-
-void llama_model_set_lm_head(struct llama_model * model, ggml_tensor * tensor) {
-    model->output = tensor;
-}
-
 const int32_t * llama_model_target_extract_layers(const struct llama_model * model) {
     const auto & v = model->target_extract_layers;
     return v.empty() ? nullptr : v.data();

From 8002c4ce8066af3ab6750b556131acda85924d15 Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Wed, 10 Jun 2026 21:19:36 +0000
Subject: [PATCH 15/27] eagle3: set d2t vocab mapping in decode graph

---
 conversion/llama.py   | 13 +++++++++++--
 src/llama-context.cpp | 36 +-----------------------------------
 src/models/eagle3.cpp | 15 +++++++++++++++
 3 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/conversion/llama.py b/conversion/llama.py
index b08388e456bd..dd732716545e 100644
--- a/conversion/llama.py
+++ b/conversion/llama.py
@@ -5,6 +5,7 @@
 
 from typing import Callable, Iterable, TYPE_CHECKING
 
+import numpy as np
 import torch
 
 if TYPE_CHECKING:
@@ -65,6 +66,7 @@ def __init__(self, *args, **kwargs):
 
             if "text_config" in target_config:
                 target_config = {**target_config, **target_config["text_config"]}
+            self.target_vocab_size = target_config["vocab_size"]
 
             # extract_layers: derived from target model layer count (low/mid/high)
             target_num_layers = target_config["num_hidden_layers"]
@@ -316,11 +318,18 @@ def prepare_tensors(self):
 
         super().prepare_tensors()
 
-        # eagle3: write d2t as int64 directly (not converted to F32)
+        # eagle3: write d2t as absolute target token ids
         if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'):
             for name, data_torch in self._eagle3_int_tensors.items():
                 old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype)
-                data = data_torch.to(torch.int64).numpy()
+                data = data_torch.to(torch.int64).cpu().numpy()
+                if name == "d2t":
+                    data = data.reshape(-1)
+                    data = data + np.arange(data.size, dtype=np.int64)
+                    if np.any((data < 0) | (data >= self.target_vocab_size)):
+                        raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}")
+                    if np.unique(data).size != data.size:
+                        raise ValueError("EAGLE-3 d2t contains duplicate target ids")
                 data_qtype = gguf.GGMLQuantizationType.I64
 
                 shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index ec7a60f65dcf..21043d34b9c2 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1893,41 +1893,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
             if (n_outputs) {
                 GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all);
                 GGML_ASSERT((n_outputs_prev + n_outputs)*n_vocab <= (int64_t) logits.size);
-
-                // eagle3: Map draft vocab to target vocab
-                if (model.arch == LLM_ARCH_EAGLE3 && model.d2t) {
-                    static thread_local std::vector<int64_t> eagle3_d2t_map;
-                    static thread_local std::vector<float>   eagle3_draft_logits;
-
-                    const int64_t draft_vocab_size = t_logits->ne[0];
-
-                    if (eagle3_d2t_map.empty()) {
-                        eagle3_d2t_map.resize(model.d2t->ne[0]);
-                        ggml_backend_tensor_get(model.d2t, eagle3_d2t_map.data(), 0,
-                                                eagle3_d2t_map.size() * sizeof(int64_t));
-                    }
-
-                    // remap every output row (one per sequence) from draft vocab to target vocab.
-                    eagle3_draft_logits.resize((size_t) n_outputs * draft_vocab_size);
-                    ggml_backend_tensor_get_async(backend_res, t_logits, eagle3_draft_logits.data(),
-                                                  0, (size_t) n_outputs * draft_vocab_size * sizeof(float));
-                    synchronize();
-
-                    for (uint32_t r = 0; r < n_outputs; r++) {
-                        float       * row_out = logits_out + (size_t) r * n_vocab;
-                        const float * row_in  = eagle3_draft_logits.data() + (size_t) r * draft_vocab_size;
-
-                        std::fill(row_out, row_out + n_vocab, -std::numeric_limits<float>::infinity());
-
-                        for (int64_t j = 0; j < draft_vocab_size; j++) {
-                            const int64_t target_id = j + eagle3_d2t_map[j];
-                            GGML_ASSERT(target_id >= 0 && target_id < n_vocab);
-                            row_out[target_id] = row_in[j];
-                        }
-                    }
-                } else {
-                    ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
-                }
+                ggml_backend_tensor_get_async(backend_res, t_logits, logits_out, 0, n_outputs*n_vocab*sizeof(float));
             }
         }
 
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index 6179e92274c8..ea0e50606fc3 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -299,6 +299,21 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
     // lm_head - projects to draft vocabulary
     cur = build_lora_mm(model.output, cur);
 
+    if (model.d2t) {
+        const int64_t n_draft_vocab = cur->ne[0];
+        const int64_t n_outputs     = cur->ne[1];
+        const int64_t n_vocab       = (int64_t) model.vocab.n_tokens();
+
+        GGML_ASSERT(model.d2t->type == GGML_TYPE_I64);
+        GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab);
+
+        ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY);
+        cur = ggml_set_rows(ctx0, logits,
+                ggml_reshape_3d(ctx0, cur,       1,             n_draft_vocab, n_outputs),
+                ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1,             1));
+        cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs);
+    }
+
     cb(cur, "result_output", -1);
     res->t_logits = cur;
 

From 33b02df4ca680712220b2b4b2c67741c586c607b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 10 Jun 2026 12:26:22 +0300
Subject: [PATCH 16/27] cont : assert layer inputs are configured

---
 src/llama-context.cpp | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 21043d34b9c2..8cea70d65237 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -71,8 +71,8 @@ llama_context::llama_context(
     cparams.no_perf                 = params.no_perf;
     cparams.warmup                  = false;
 
-    cparams.output_layer_inp.resize(hparams.n_layer_all, false);
-    embd_layer_inp.resize(hparams.n_layer_all);
+    cparams.output_layer_inp.resize(hparams.n_layer(), false);
+    embd_layer_inp.resize(hparams.n_layer());
 
     cparams.ctx_type     = params.ctx_type;
     cparams.pooling_type = params.pooling_type;
@@ -1281,7 +1281,7 @@ bool llama_context::set_adapter_cvec(
 void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) {
     LLAMA_LOG_DEBUG("%s: layer_id = %d, enable = %d\n", __func__, layer_id, enable);
 
-    GGML_ASSERT(layer_id < model.hparams.n_layer_all);
+    GGML_ASSERT(layer_id < model.hparams.n_layer());
 
     cparams.output_layer_inp[layer_id] = enable;
 
@@ -2111,7 +2111,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0;
     const size_t new_size  =
         (logits.size + embd.size + embd_nextn.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) +
-        (                                                                                  backend_token_count) * sizeof(llama_token);
+        (                                                                         backend_token_count) * sizeof(llama_token);
 
     // alloc only when more than the current capacity is required
     // TODO: also consider shrinking the buffer
@@ -2223,12 +2223,13 @@ void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t to
             continue;
         }
         if (!embd_layer_inp[il].has_data()) {
-            continue;
+            GGML_ABORT("output layer input buffer not allocated");
         }
         ggml_tensor * t = res->get_layer_inp((int) il);
         if (!t) {
-            continue;
+            GGML_ABORT("layer input tensor not found");
         }
+
         const size_t nbytes = ggml_nbytes(t);
         const size_t nfloats = nbytes / sizeof(float);
         GGML_ASSERT(n_tokens > 0);

From 785722184028e4a40bcfa296139ef97ef2e71aab Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 11 Jun 2026 11:21:53 +0300
Subject: [PATCH 17/27] hparams : use n_embd_inp instead of
 n_embd_target_features

---
 src/llama-context.cpp |  4 +---
 src/llama-hparams.h   |  5 ++---
 src/models/eagle3.cpp | 12 +++++-------
 3 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 8cea70d65237..62f05808fe7c 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1380,9 +1380,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
     const auto & hparams = model.hparams;
 
     // eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim
-    const int64_t n_embd = (hparams.n_embd_target_features > 0 && batch_inp.embd)
-                             ? (int64_t) hparams.n_embd_target_features
-                             : hparams.n_embd_inp();
+    const int64_t n_embd = hparams.n_embd_inp();
     const int64_t n_vocab = model.vocab.n_tokens();
 
     // note: during encode, we always pass the full sequence starting from pos = 0
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 970a8d689e05..6d4c0080db20 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -236,9 +236,8 @@ struct llama_hparams {
     std::array<int32_t, LLAMA_MAX_LAYERS> deepstack_mapping_arr;
 
     // eagle3/DFlash sahred params
-    // n_embd_target_features = n_extract * target_hidden_size (encoder input dim)
-    uint32_t n_embd_target_features = 0;
-    uint32_t target_hidden_size     = 0;
+    // n_embd_impl = n_extract * target_hidden_size (encoder input dim)
+    uint32_t target_hidden_size = 0;
     // eagle3: whether to apply hidden_norm before storing residual
     bool eagle3_norm_before_residual = false;
 
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index ea0e50606fc3..683c7ffad3c7 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -18,7 +18,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__,
             hparams.target_hidden_size, hparams.n_embd);
 
-    hparams.n_embd_target_features = (uint32_t) target_extract_layers.size() * hparams.target_hidden_size;
+    hparams.n_embd_inp_impl = (uint32_t) target_extract_layers.size() * hparams.target_hidden_size;
 
     // eagle3 norm_before_residual (optional, default false)
     // compatible with Readhat eagle3 speculator model
@@ -33,7 +33,7 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
 void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
     LLAMA_LOAD_LOCALS;
 
-    const int64_t n_embd_target_features = (int64_t) hparams.n_embd_target_features;
+    const int64_t n_embd_inp = hparams.n_embd_inp();
     const int64_t n_embd_attn_input = 2 * n_embd;
 
     // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target)
@@ -50,7 +50,7 @@ void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
     }
 
     // Feature fusion layer: projects 3 target layers to draft hidden size
-    fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_target_features, n_embd}, 0);
+    fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_inp, n_embd}, 0);
 
     // Output layer (uses draft vocab size)
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
@@ -104,14 +104,12 @@ std::unique_ptr<llm_graph_context> llama_model_eagle3::build_arch_graph(const ll
 
 template <>
 ggml_tensor * llama_model_eagle3::graph<true>::build_inp_embd_enc() const {
-    const int64_t n_embd_target_features = (int64_t) hparams.n_embd_target_features;
-
     ggml_tensor * cur = nullptr;
 
     // Input: Target model features (3 layers concatenated: low, mid, high)
     // Data will be provided via ubatch->embd in encode_eagle3_features()
-    auto inp_target = std::make_unique<llm_graph_input_embd>(n_embd_target_features);
-    inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_target_features, n_tokens);
+    auto inp_target = std::make_unique<llm_graph_input_embd>(hparams.n_embd_inp());
+    inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,hparams.n_embd_inp(), n_tokens);
     ggml_set_input(inp_target->embd);
 
     cur = inp_target->embd;

From 9b2543d00d8d6bf3f42cfd4bea9ad2bfac05a47c Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Thu, 11 Jun 2026 10:35:55 +0000
Subject: [PATCH 18/27] eagle3: make output.weight optional and inherit from
 target model when needed

---
 src/llama-context.cpp |  2 +-
 src/models/eagle3.cpp | 13 +++++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 62f05808fe7c..772ba50dc74b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -101,7 +101,7 @@ llama_context::llama_context(
     }
 
     if (model.arch == LLM_ARCH_EAGLE3) {
-        if (model.tok_embd == nullptr) {
+        if (model.tok_embd == nullptr || model.output == nullptr) {
             if (params.ctx_other == nullptr) {
                 throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)");
             }
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index 683c7ffad3c7..471971ab3b13 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -54,7 +54,7 @@ void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
 
     // Output layer (uses draft vocab size)
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_draft_vocab}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED);
 
     // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own)
     const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str());
@@ -295,7 +295,16 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
     cb(cur, "result_norm", -1);
 
     // lm_head - projects to draft vocabulary
-    cur = build_lora_mm(model.output, cur);
+    // if the draft has no own output projection, inherit the target model's lm_head
+    auto * output = model.output;
+    if (output == nullptr) {
+        GGML_ASSERT(cparams.ctx_other != nullptr);
+        const auto * model_other = llama_get_model(cparams.ctx_other);
+
+        GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)");
+        output = model_other->output;
+    }
+    cur = build_lora_mm(output, cur);
 
     if (model.d2t) {
         const int64_t n_draft_vocab = cur->ne[0];

From 1d55316eb18d781131ae84c463f93b252375e002 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 11 Jun 2026 13:16:32 +0300
Subject: [PATCH 19/27] hparams : generic norm-before-residual param

---
 src/llama-hparams.h   | 5 ++---
 src/models/eagle3.cpp | 6 +++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 6d4c0080db20..eac464e6b645 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -45,6 +45,7 @@ struct llama_hparams {
     bool rope_finetuned;
     bool use_par_res;
     bool swin_norm;
+    bool norm_before_residual = false;
 
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
@@ -236,10 +237,8 @@ struct llama_hparams {
     std::array<int32_t, LLAMA_MAX_LAYERS> deepstack_mapping_arr;
 
     // eagle3/DFlash sahred params
-    // n_embd_impl = n_extract * target_hidden_size (encoder input dim)
+    // n_embd_inp = n_extract * target_hidden_size (encoder input dim)
     uint32_t target_hidden_size = 0;
-    // eagle3: whether to apply hidden_norm before storing residual
-    bool eagle3_norm_before_residual = false;
 
     // gemma4 per-layer embedding
     uint32_t n_embd_per_layer = 0;
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index 471971ab3b13..2f9999bc402e 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -22,8 +22,8 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
 
     // eagle3 norm_before_residual (optional, default false)
     // compatible with Readhat eagle3 speculator model
-    ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.eagle3_norm_before_residual, false);
-    if (hparams.eagle3_norm_before_residual) {
+    ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false);
+    if (hparams.norm_before_residual) {
         LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__);
     }
 
@@ -212,7 +212,7 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
         // - false (default): use raw inp_g for residual
         // - true: use normalized g_norm for residual
         // inpL is the concatenated input (normalized inp_embd + normalized inp_g)
-        ggml_tensor * inpSA = hparams.eagle3_norm_before_residual ? g_norm : inpL;
+        ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL;
 
         // Concatenate normalized inp_embd and normalized inp_g
         cur = ggml_concat(ctx0, embd_norm, g_norm, il);

From 2de116b28b57b4c59524241a61c92c0b3a200645 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 11 Jun 2026 14:38:10 +0300
Subject: [PATCH 20/27] llama-ext : consistent names

---
 common/speculative.cpp |  4 +--
 src/llama-context.cpp  | 63 ++++++++++++++++++++++++------------------
 src/llama-context.h    |  8 ++----
 src/llama-ext.h        | 14 +++++-----
 4 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 6af8dc2a9d0b..a3a64f925a9d 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -483,7 +483,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
 
         // turn on extraction of the target layers' input embeddings
         for (uint32_t k = 0; k < n_extract_layers; ++k) {
-            llama_set_output_layer_inp(ctx_tgt, (uint32_t) extract_layers[k], true);
+            llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) extract_layers[k], true);
         }
 
         // turn on extraction of the draft model's pre-norm hidden state
@@ -559,7 +559,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f);
 
         for (uint32_t k = 0; k < n_extract_layers; ++k) {
-            const float * layer = llama_get_output_layer_inp(ctx_tgt, (uint32_t) extract_layers[k]);
+            const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) extract_layers[k]);
             if (!layer) {
                 GGML_ABORT("EAGLE3: target layer %d input not extracted.", extract_layers[k]);
             }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 772ba50dc74b..5a015cba6755 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -950,6 +950,14 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) {
     }
 }
 
+float * llama_context::get_embeddings_layer_inp(uint32_t lid) {
+    output_reorder();
+
+    GGML_ASSERT(lid < embd_layer_inp.size() && embd_layer_inp[lid].has_data());
+
+    return embd_layer_inp[lid].data;
+}
+
 llama_token llama_context::get_sampled_token_ith(int32_t idx) {
     output_reorder();
 
@@ -1137,6 +1145,16 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) {
     cparams.embeddings_nextn_masked = masked;
 }
 
+void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) {
+    LLAMA_LOG_DEBUG("%s: lid = %d, enable = %d\n", __func__, lid, enable);
+
+    GGML_ASSERT(lid < model.hparams.n_layer());
+
+    cparams.output_layer_inp[lid] = enable;
+
+    sched_need_reserve = true;
+}
+
 void llama_context::set_causal_attn(bool value) {
     LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value);
 
@@ -1278,23 +1296,6 @@ bool llama_context::set_adapter_cvec(
     return res;
 }
 
-void llama_context::set_output_layer_inp(uint32_t layer_id, bool enable) {
-    LLAMA_LOG_DEBUG("%s: layer_id = %d, enable = %d\n", __func__, layer_id, enable);
-
-    GGML_ASSERT(layer_id < model.hparams.n_layer());
-
-    cparams.output_layer_inp[layer_id] = enable;
-
-    sched_need_reserve = true;
-}
-
-float * llama_context::get_output_layer_inp(uint32_t layer_id) {
-    if (layer_id >= embd_layer_inp.size() || !embd_layer_inp[layer_id].has_data()) {
-        return nullptr;
-    }
-    return embd_layer_inp[layer_id].data;
-}
-
 llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
     if (mctx && !mctx->apply()) {
         LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
@@ -2269,6 +2270,14 @@ void llama_context::output_reorder() {
             }
         }
 
+        if (embd_layer_inp.size() > 0) {
+            for (int lid = 0; lid < (int) embd_layer_inp.size(); ++lid) {
+                for (uint64_t k = 0; k < n_embd; ++k) {
+                    std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]);
+                }
+            }
+        }
+
         if (!sampling.samplers.empty()) {
             assert(sampling.logits.size > 0);
             assert(sampling.probs.size > 0);
@@ -3683,6 +3692,10 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) {
     ctx->set_embeddings_nextn(value, masked);
 }
 
+void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool value) {
+    ctx->set_embeddings_layer_inp(lid, value);
+}
+
 llama_memory_t llama_get_memory(const struct llama_context * ctx) {
     if (!ctx) {
         return nullptr;
@@ -3703,6 +3716,12 @@ float * llama_get_embeddings_nextn_ith(llama_context * ctx, int32_t i) {
     return ctx->get_embeddings_nextn_ith(i);
 }
 
+float * llama_get_embeddings_layer_inp(llama_context * ctx, uint32_t lid) {
+    ctx->synchronize();
+
+    return ctx->get_embeddings_layer_inp(lid);
+}
+
 bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) {
     return ctx->set_sampler(seq_id, smpl);
 }
@@ -4108,13 +4127,3 @@ llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * c
 llama_context * llama_get_ctx_other(struct llama_context * ctx) {
     return ctx->get_cparams().ctx_other;
 }
-
-void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable) {
-    ctx->set_output_layer_inp(layer_id, enable);
-}
-
-float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id) {
-    ctx->synchronize();
-
-    return ctx->get_output_layer_inp(layer_id);
-}
diff --git a/src/llama-context.h b/src/llama-context.h
index d6e321a5049c..853052be2cad 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -88,6 +88,8 @@ struct llama_context {
     float * get_embeddings_nextn();
     float * get_embeddings_nextn_ith(int32_t i);
 
+    float * get_embeddings_layer_inp(uint32_t lid);
+
     llama_token * get_sampled_tokens() const;
     llama_token   get_sampled_token_ith(int32_t idx);
 
@@ -112,6 +114,7 @@ struct llama_context {
 
     void set_embeddings (bool value);
     void set_embeddings_nextn(bool value, bool masked);
+    void set_embeddings_layer_inp(uint32_t lid, bool enable);
     void set_causal_attn(bool value);
     void set_warmup(bool value);
 
@@ -126,11 +129,6 @@ struct llama_context {
                 int32_t   il_start,
                 int32_t   il_end);
 
-    void set_output_layer_inp(uint32_t layer_id, bool enable);
-
-    // read back the input embeddings of the specified layer
-    float * get_output_layer_inp(uint32_t layer_id);
-
     // process a single ubatch with a specific graph type
     // if memory_context is provided, it will be applied first to the context's memory
     // ret contains the status of the graph computation
diff --git a/src/llama-ext.h b/src/llama-ext.h
index f511f091bb86..d20bdefc517d 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -101,19 +101,19 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx);
 // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
 LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i);
 
+// Set whether the context outputs the input embeddings of a specific layer
+LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value);
+
+// mirrors:
+// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid);
+
 LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
 
 //
 // model/context data extraction
 //
 
-// set if the layer input embeddings should be outputed
-LLAMA_API void llama_set_output_layer_inp(struct llama_context * ctx, uint32_t layer_id, bool enable);
-
-// read back the input embeddings of the specified layer for the most recent decode batch
-// the layer must have been enabled via llama_set_output_layer_inp
-LLAMA_API float * llama_get_output_layer_inp(struct llama_context * ctx, uint32_t layer_id);
-
 //
 // eagle3/DFlash: consume target model extracted features
 //

From f4088797e483e06008be6e9aaf2be3379c7adfdf Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 11 Jun 2026 15:04:53 +0300
Subject: [PATCH 21/27] cont : fix

---
 src/llama-context.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 5a015cba6755..077eab7753b3 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2272,8 +2272,10 @@ void llama_context::output_reorder() {
 
         if (embd_layer_inp.size() > 0) {
             for (int lid = 0; lid < (int) embd_layer_inp.size(); ++lid) {
-                for (uint64_t k = 0; k < n_embd; ++k) {
-                    std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]);
+                if (embd_layer_inp[lid].size > 0) {
+                    for (uint64_t k = 0; k < n_embd; ++k) {
+                        std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]);
+                    }
                 }
             }
         }

From d37323315c16128dd9c3a6e98a4eb13be4c9f1f9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 11 Jun 2026 15:10:52 +0300
Subject: [PATCH 22/27] hparams : remove target_hidden_size

---
 common/speculative.cpp | 20 +++++++-------------
 src/llama-ext.h        |  2 --
 src/llama-hparams.h    |  4 ----
 src/llama-model.cpp    |  4 ----
 src/models/eagle3.cpp  |  9 +++++----
 5 files changed, 12 insertions(+), 27 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index a3a64f925a9d..06c28aa322dc 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -418,7 +418,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
 
     int32_t         n_embd_dec       = 0;       // draft hidden size
     int32_t         n_embd_enc       = 0;       // n_extract_layers * target_hidden_size
-    int32_t         tgt_hidden       = 0;       // target model hidden size
+    int32_t         n_embd_tgt       = 0;       // target model hidden size
     const int32_t * extract_layers   = nullptr; // model_dft's extract layer indices
     uint32_t        n_extract_layers = 0;
 
@@ -456,15 +456,9 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
                                      std::to_string(n_extract_layers) + ")");
         }
 
-        tgt_hidden = (int32_t) llama_model_target_hidden_size(model_dft);
-        if (tgt_hidden != llama_model_n_embd(model_tgt)) {
-            throw std::runtime_error("EAGLE3 target_hidden_size mismatch (draft expects " +
-                                     std::to_string(tgt_hidden) + ", target n_embd is " +
-                                     std::to_string(llama_model_n_embd(model_tgt)) + ")");
-        }
-
+        n_embd_tgt = llama_model_n_embd(model_tgt);
         n_embd_dec = llama_model_n_embd(model_dft);
-        n_embd_enc = (int32_t) n_extract_layers * tgt_hidden;
+        n_embd_enc = (int32_t) n_extract_layers * n_embd_tgt;
 
         const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
         batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1);
@@ -554,7 +548,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         auto * ctx_dft = this->params.ctx_dft;
 
         // Interleave each extract_layer's hidden state into a contiguous buffer of
-        // shape [n_tokens, n_extract_layers * tgt_hidden]. Then run EAGLE3 encoder
+        // shape [n_tokens, n_extract_layers * n_embd_tgt]. Then run EAGLE3 encoder
         // to get one g_embd row per token.
         features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f);
 
@@ -564,9 +558,9 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
                 GGML_ABORT("EAGLE3: target layer %d input not extracted.", extract_layers[k]);
             }
             for (int32_t i = 0; i < n_tokens; ++i) {
-                float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) tgt_hidden;
-                const float * src = layer + (size_t) i * tgt_hidden;
-                std::memcpy(dst, src, (size_t) tgt_hidden * sizeof(float));
+                float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt;
+                const float * src = layer + (size_t) i * n_embd_tgt;
+                std::memcpy(dst, src, (size_t) n_embd_tgt * sizeof(float));
             }
         }
 
diff --git a/src/llama-ext.h b/src/llama-ext.h
index d20bdefc517d..105daa367c96 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -122,5 +122,3 @@ LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
 LLAMA_API const int32_t * llama_model_target_extract_layers  (const struct llama_model * model);
 // returns the number of extracted layers from target model
 LLAMA_API uint32_t        llama_model_n_target_extract_layers(const struct llama_model * model);
-// returns the target model hidden size
-LLAMA_API uint32_t        llama_model_target_hidden_size     (const struct llama_model * model);
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index eac464e6b645..d045059a63e9 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -236,10 +236,6 @@ struct llama_hparams {
     // >=0 => input embedding index for deepstack injection
     std::array<int32_t, LLAMA_MAX_LAYERS> deepstack_mapping_arr;
 
-    // eagle3/DFlash sahred params
-    // n_embd_inp = n_extract * target_hidden_size (encoder input dim)
-    uint32_t target_hidden_size = 0;
-
     // gemma4 per-layer embedding
     uint32_t n_embd_per_layer = 0;
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 5a2b7aaec451..0e81a49768e0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2700,7 +2700,3 @@ const int32_t * llama_model_target_extract_layers(const struct llama_model * mod
 uint32_t llama_model_n_target_extract_layers(const struct llama_model * model) {
     return (uint32_t) model->target_extract_layers.size();
 }
-
-uint32_t llama_model_target_hidden_size(const struct llama_model * model) {
-    return model->hparams.target_hidden_size;
-}
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index 2f9999bc402e..f23b1f5ee9b0 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -14,11 +14,12 @@ void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
             target_extract_layers[1],
             target_extract_layers[2]);
 
-    ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, hparams.target_hidden_size);
-    LLAMA_LOG_INFO("%s: EAGLE3 target_hidden_size = %u (draft n_embd = %u)\n", __func__,
-            hparams.target_hidden_size, hparams.n_embd);
+    uint32_t n_embd_tgt = 0;
 
-    hparams.n_embd_inp_impl = (uint32_t) target_extract_layers.size() * hparams.target_hidden_size;
+    ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, n_embd_tgt);
+    LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);
+
+    hparams.n_embd_inp_impl = (uint32_t) target_extract_layers.size() * n_embd_tgt;
 
     // eagle3 norm_before_residual (optional, default false)
     // compatible with Readhat eagle3 speculator model

From 5caedbcd4fbffd057eb0963ff4af1709a070a7c2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 11 Jun 2026 15:14:27 +0300
Subject: [PATCH 23/27] cparams : rename output_layer_inp ->
 embeddings_layer_inp

---
 src/llama-context.cpp | 12 ++++++------
 src/llama-cparams.h   |  2 +-
 src/llama-graph.cpp   |  6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 077eab7753b3..23f5a7ee29cf 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -71,7 +71,7 @@ llama_context::llama_context(
     cparams.no_perf                 = params.no_perf;
     cparams.warmup                  = false;
 
-    cparams.output_layer_inp.resize(hparams.n_layer(), false);
+    cparams.embeddings_layer_inp.resize(hparams.n_layer(), false);
     embd_layer_inp.resize(hparams.n_layer());
 
     cparams.ctx_type     = params.ctx_type;
@@ -1150,7 +1150,7 @@ void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) {
 
     GGML_ASSERT(lid < model.hparams.n_layer());
 
-    cparams.output_layer_inp[lid] = enable;
+    cparams.embeddings_layer_inp[lid] = enable;
 
     sched_need_reserve = true;
 }
@@ -2089,7 +2089,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
         embd_nextn.size = (size_t) n_embd_out * n_batch;
     }
 
-    for (bool enabled : cparams.output_layer_inp) {
+    for (bool enabled : cparams.embeddings_layer_inp) {
         if (enabled) {
             embd_layer_inp_float_count += (size_t) n_embd * n_batch;
         }
@@ -2162,7 +2162,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
     offset += embd_nextn.size * sizeof(float);
 
     for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) {
-        if (cparams.output_layer_inp[il]) {
+        if (cparams.embeddings_layer_inp[il]) {
             embd_layer_inp[il] = buffer_view<float>{(float *) (base + offset), (size_t) n_embd * n_batch};
             offset += embd_layer_inp[il].size * sizeof(float);
         } else {
@@ -2217,8 +2217,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 }
 
 void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) {
-    for (uint32_t il = 0; il < cparams.output_layer_inp.size(); ++il) {
-        if (!cparams.output_layer_inp[il]) {
+    for (uint32_t il = 0; il < cparams.embeddings_layer_inp.size(); ++il) {
+        if (!cparams.embeddings_layer_inp[il]) {
             continue;
         }
         if (!embd_layer_inp[il].has_data()) {
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index cb326c8e31ca..2b109f909c0b 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -45,7 +45,7 @@ struct llama_cparams {
     bool kv_unified;
     bool pipeline_parallel;
 
-    std::vector<bool> output_layer_inp;
+    std::vector<bool> embeddings_layer_inp; // [n_layer()] extract input embeddings for layer
 
     enum llama_context_type ctx_type;
     enum llama_pooling_type pooling_type;
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 83609e5294f9..fe28385d3b7e 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -950,9 +950,9 @@ void llm_graph_result::set_outputs(const llm_graph_params & params) {
         ggml_set_output(t_h_nextn);
     }
     {
-        const auto & output_layer_inp = params.cparams.output_layer_inp;
-        for (size_t il = 0; il < output_layer_inp.size(); ++il) {
-            if (output_layer_inp[il]) {
+        const auto & embeddings_layer_inp = params.cparams.embeddings_layer_inp;
+        for (size_t il = 0; il < embeddings_layer_inp.size(); ++il) {
+            if (embeddings_layer_inp[il]) {
                 ggml_set_output(t_layer_inp[il]);
             }
         }

From 0274f0fc7d7fbab69c33ccf757cf7f3d036c17aa Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 11 Jun 2026 15:26:10 +0300
Subject: [PATCH 24/27] arch : reuse ATTN_NORM_2 instead of adding new hidden
 norm

---
 conversion/llama.py       | 2 +-
 gguf-py/gguf/constants.py | 4 +---
 src/llama-arch.cpp        | 3 ---
 src/llama-arch.h          | 1 -
 src/llama-model.h         | 3 ---
 src/models/eagle3.cpp     | 8 ++++----
 6 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/conversion/llama.py b/conversion/llama.py
index dd732716545e..0cce96a582e0 100644
--- a/conversion/llama.py
+++ b/conversion/llama.py
@@ -231,7 +231,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 # not used at runtime, skip
                 return
             if name == "model.layers.0.hidden_norm.weight":
-                yield ("blk.0.hidden_norm.weight", data_torch)
+                yield ("blk.0.attn_norm_2.weight", data_torch)
                 return
 
         n_head = self.find_hparam(["n_heads", "num_attention_heads"])
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index b1f4f00fbecb..545b37db733e 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -914,7 +914,6 @@ class MODEL_TENSOR(IntEnum):
     NEXTN_SHARED_HEAD_NORM = auto()
     # eagle3
     EAGLE3_FC          = auto()  # feature fusion layer
-    EAGLE3_HIDDEN_NORM = auto()  # hidden normalization
     EAGLE3_D2T         = auto()  # draft to target vocabulary mapping
     # lfm2 audio
     A_ENC_NORM_CONV        = auto()
@@ -1497,7 +1496,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD:    "blk.{bid}.nextn.shared_head_head",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM:    "blk.{bid}.nextn.shared_head_norm",
     MODEL_TENSOR.EAGLE3_FC:                 "fc",
-    MODEL_TENSOR.EAGLE3_HIDDEN_NORM:        "blk.{bid}.hidden_norm",
     MODEL_TENSOR.EAGLE3_D2T:                "d2t",
 }
 
@@ -4045,6 +4043,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.OUTPUT,
         MODEL_TENSOR.ROPE_FREQS,
         MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
         MODEL_TENSOR.ATTN_V,
@@ -4054,7 +4053,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
         MODEL_TENSOR.EAGLE3_FC,
-        MODEL_TENSOR.EAGLE3_HIDDEN_NORM,
         MODEL_TENSOR.EAGLE3_D2T,
     ],
     MODEL_ARCH.MISTRAL4: [
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index a58f599712de..f88c3bf74e11 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -3,7 +3,6 @@
 #include "llama-impl.h"
 
 #include <map>
-#include <set>
 #include <vector>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
@@ -566,7 +565,6 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_INDEXER_ATTN_Q_B,                       "blk.%d.indexer.attn_q_b" },
     { LLM_TENSOR_MASKED_EMBD_CENTROIDS,                  "masked_embd_centroids" },
     { LLM_TENSOR_MASKED_EMBD_ORDERING,                   "masked_embd_ordering" },
-    { LLM_TENSOR_EAGLE3_HIDDEN_NORM,                     "blk.%d.hidden_norm" },
     { LLM_TENSOR_EAGLE3_FC,                              "fc" },
     { LLM_TENSOR_EAGLE3_D2T,                             "d2t" },
 };
@@ -797,7 +795,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_MASKED_EMBD_ORDERING,       {LLM_TENSOR_LAYER_INPUT,     GGML_OP_NONE}},
     // eagle3
     {LLM_TENSOR_EAGLE3_FC,                  {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_EAGLE3_HIDDEN_NORM,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_EAGLE3_D2T,                 {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_GET_ROWS}},
 };
 
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 0303cf6c11f7..dc06d02157ca 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -574,7 +574,6 @@ enum llm_tensor {
     LLM_TENSOR_MASKED_EMBD_CENTROIDS,
     LLM_TENSOR_MASKED_EMBD_ORDERING,
     LLM_TENSOR_EAGLE3_FC,
-    LLM_TENSOR_EAGLE3_HIDDEN_NORM, // TODO: remove, use LLM_TENSOR_ATTN_NORM instead
     LLM_TENSOR_EAGLE3_D2T,
 };
 
diff --git a/src/llama-model.h b/src/llama-model.h
index b28eb7baf256..a350de4c01e6 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -475,9 +475,6 @@ struct llama_layer {
     struct ggml_tensor * ffn_act_beta    = nullptr;
     struct ggml_tensor * ffn_act_eps     = nullptr;
 
-    // eagle3
-    struct ggml_tensor * eagle3_hidden_norm = nullptr;
-
     // Kimi Linear KDA (using ssm_ prefix for consistency)
     // Note: ssm_dt_b already exists above (mamba bias), reused for Kimi dt_bias
     struct ggml_tensor * ssm_q_conv = nullptr;
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index f23b1f5ee9b0..3a299126b574 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -72,15 +72,15 @@ void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
         // input_layernorm: applied to token embeddings
         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
+        // eagle3 specific: hidden_norm applied to fused target features
+        layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0);
+
         // Attention takes input_embeds_normed + fused_target_normed as input
         layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0);
         layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0);
         layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0);
         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);
 
-        // eagle3 specific: hidden_norm applied to fused target features
-        layer.eagle3_hidden_norm = create_tensor(tn(LLM_TENSOR_EAGLE3_HIDDEN_NORM, "weight", i), {n_embd}, 0);
-
         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
@@ -205,7 +205,7 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
 
         // Apply hidden_norm to inp_g
         ggml_tensor * g_norm = build_norm(inp_g,
-                model.layers[il].eagle3_hidden_norm, NULL,
+                model.layers[il].attn_norm_2, NULL,
                 LLM_NORM_RMS, -1);
         cb(g_norm, "g_norm", il);
 

From 9baa68be99c0969e1181d805d649b8cb132b15a0 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 11 Jun 2026 15:42:08 +0300
Subject: [PATCH 25/27] llama : clean-up names

---
 common/speculative.cpp    | 33 +++++++++++++++++----------------
 conversion/llama.py       |  8 ++++----
 gguf-py/gguf/constants.py | 34 +++++++++++++++++-----------------
 src/llama-arch.cpp        | 20 ++++++++++----------
 src/llama-arch.h          | 10 +++++-----
 src/llama-ext.h           |  8 ++------
 src/llama-model.cpp       |  8 ++++----
 src/llama-model.h         |  2 +-
 src/models/eagle3.cpp     | 22 +++++++++++-----------
 9 files changed, 71 insertions(+), 74 deletions(-)

diff --git a/common/speculative.cpp b/common/speculative.cpp
index 06c28aa322dc..87e9047de73a 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -416,11 +416,12 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
 
     std::vector<common_sampler_ptr> smpls;
 
-    int32_t         n_embd_dec       = 0;       // draft hidden size
-    int32_t         n_embd_enc       = 0;       // n_extract_layers * target_hidden_size
-    int32_t         n_embd_tgt       = 0;       // target model hidden size
-    const int32_t * extract_layers   = nullptr; // model_dft's extract layer indices
-    uint32_t        n_extract_layers = 0;
+    int32_t n_embd_dec = 0;       // draft hidden size
+    int32_t n_embd_enc = 0;       // target_layer_ids_n * n_embd_tgt
+    int32_t n_embd_tgt = 0;       // target model hidden size
+
+    const int32_t * target_layer_ids   = nullptr; // model_dft's extract layer indices
+    uint32_t        target_layer_ids_n = 0;
 
     // [per-seq] deferred boundary state
     std::vector<std::vector<float>> pending_g_last;
@@ -449,16 +450,16 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         const llama_model * model_dft = llama_get_model(ctx_dft);
         const llama_model * model_tgt = llama_get_model(ctx_tgt);
 
-        extract_layers   = llama_model_target_extract_layers  (model_dft);
-        n_extract_layers = llama_model_n_target_extract_layers(model_dft);
-        if (n_extract_layers != 3) {
+        target_layer_ids   = llama_model_target_layer_ids  (model_dft);
+        target_layer_ids_n = llama_model_target_layer_ids_n(model_dft);
+        if (target_layer_ids_n != 3) {
             throw std::runtime_error("draft model is not eagle3 (expected 3 extract layers, got " +
-                                     std::to_string(n_extract_layers) + ")");
+                                     std::to_string(target_layer_ids_n) + ")");
         }
 
         n_embd_tgt = llama_model_n_embd(model_tgt);
         n_embd_dec = llama_model_n_embd(model_dft);
-        n_embd_enc = (int32_t) n_extract_layers * n_embd_tgt;
+        n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt;
 
         const int32_t n_b = (int32_t) llama_n_batch(ctx_dft);
         batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1);
@@ -476,8 +477,8 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         }
 
         // turn on extraction of the target layers' input embeddings
-        for (uint32_t k = 0; k < n_extract_layers; ++k) {
-            llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) extract_layers[k], true);
+        for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
+            llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true);
         }
 
         // turn on extraction of the draft model's pre-norm hidden state
@@ -548,14 +549,14 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
         auto * ctx_dft = this->params.ctx_dft;
 
         // Interleave each extract_layer's hidden state into a contiguous buffer of
-        // shape [n_tokens, n_extract_layers * n_embd_tgt]. Then run EAGLE3 encoder
+        // shape [n_tokens, target_layer_ids_n * n_embd_tgt]. Then run EAGLE3 encoder
         // to get one g_embd row per token.
         features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f);
 
-        for (uint32_t k = 0; k < n_extract_layers; ++k) {
-            const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) extract_layers[k]);
+        for (uint32_t k = 0; k < target_layer_ids_n; ++k) {
+            const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k]);
             if (!layer) {
-                GGML_ABORT("EAGLE3: target layer %d input not extracted.", extract_layers[k]);
+                GGML_ABORT("EAGLE3: target layer %d input not extracted.", target_layer_ids[k]);
             }
             for (int32_t i = 0; i < n_tokens; ++i) {
                 float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt;
diff --git a/conversion/llama.py b/conversion/llama.py
index 0cce96a582e0..cad802bf8246 100644
--- a/conversion/llama.py
+++ b/conversion/llama.py
@@ -68,11 +68,11 @@ def __init__(self, *args, **kwargs):
                 target_config = {**target_config, **target_config["text_config"]}
             self.target_vocab_size = target_config["vocab_size"]
 
-            # extract_layers: derived from target model layer count (low/mid/high)
+            # target_layers: derived from target model layer count (low/mid/high)
             target_num_layers = target_config["num_hidden_layers"]
-            extract_layers = [2, target_num_layers // 2, target_num_layers - 3]
-            logger.info(f"EAGLE-3: extract_layers = {extract_layers} (target model has {target_num_layers} layers)")
-            self.gguf_writer.add_array(f"{self.gguf_writer.arch}.extract_layers", extract_layers)
+            target_layers = [2, target_num_layers // 2, target_num_layers - 3]
+            logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)")
+            self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers)
 
             # target_hidden_size: prefer eagle3 config, fallback to target config
             if eagle3_raw_config.get("target_hidden_size") is not None:
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 545b37db733e..bebc57a1b615 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -154,9 +154,9 @@ class LLM:
         HIDDEN_ACT                        = "{arch}.hidden_activation"
         DENSE_FEAT_IN_SIZE                = "{arch}.{dense}_feat_in"
         DENSE_FEAT_OUT_SIZE               = "{arch}.{dense}_feat_out"
-        EAGLE3_EXTRACT_LAYERS             = "{arch}.extract_layers"
-        EAGLE3_TARGET_HIDDEN_SIZE         = "{arch}.target_hidden_size"
-        EAGLE3_NORM_BEFORE_RESIDUAL       = "{arch}.norm_before_residual"
+        TARGET_LAYERS                     = "{arch}.target_layers"
+        TARGET_HIDDEN_SIZE                = "{arch}.target_hidden_size"
+        NORM_BEFORE_RESIDUAL              = "{arch}.norm_before_residual"
 
     class Attention:
         HEAD_COUNT                   = "{arch}.attention.head_count"
@@ -904,17 +904,17 @@ class MODEL_TENSOR(IntEnum):
     A_PER_DIM_K_SCALE     = auto() # gemma4
     A_PER_DIM_SCALE       = auto() # gemma4
     # nextn/mtp
-    NEXTN_PROJ_PRE       = auto()
-    NEXTN_PROJ_POST      = auto()
-    NEXTN_EH_PROJ        = auto()
-    NEXTN_EMBED_TOKENS   = auto()
-    NEXTN_ENORM          = auto()
-    NEXTN_HNORM          = auto()
+    NEXTN_PROJ_PRE         = auto()
+    NEXTN_PROJ_POST        = auto()
+    NEXTN_EH_PROJ          = auto()
+    NEXTN_EMBED_TOKENS     = auto()
+    NEXTN_ENORM            = auto()
+    NEXTN_HNORM            = auto()
     NEXTN_SHARED_HEAD_HEAD = auto()
     NEXTN_SHARED_HEAD_NORM = auto()
     # eagle3
-    EAGLE3_FC          = auto()  # feature fusion layer
-    EAGLE3_D2T         = auto()  # draft to target vocabulary mapping
+    FC                     = auto()  # feature fusion layer
+    D2T                    = auto()  # draft to target vocabulary mapping
     # lfm2 audio
     A_ENC_NORM_CONV        = auto()
     A_ENC_LINEAR_POS       = auto()
@@ -1102,8 +1102,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.POS_EMBD:                  "position_embd",
     MODEL_TENSOR.OUTPUT_NORM:               "output_norm",
     MODEL_TENSOR.OUTPUT:                    "output",
-    MODEL_TENSOR.DENSE_2_OUT:                "dense_2", # embeddinggemma 2_Dense
-    MODEL_TENSOR.DENSE_3_OUT:                "dense_3", # embeddinggemma 2_Dense
+    MODEL_TENSOR.DENSE_2_OUT:               "dense_2", # embeddinggemma 2_Dense
+    MODEL_TENSOR.DENSE_3_OUT:               "dense_3", # embeddinggemma 2_Dense
     MODEL_TENSOR.ROPE_FREQS:                "rope_freqs",
     MODEL_TENSOR.ROPE_FACTORS_LONG:         "rope_factors_long",
     MODEL_TENSOR.ROPE_FACTORS_SHORT:        "rope_factors_short",
@@ -1495,8 +1495,8 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.NEXTN_HNORM:               "blk.{bid}.nextn.hnorm",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD:    "blk.{bid}.nextn.shared_head_head",
     MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM:    "blk.{bid}.nextn.shared_head_norm",
-    MODEL_TENSOR.EAGLE3_FC:                 "fc",
-    MODEL_TENSOR.EAGLE3_D2T:                "d2t",
+    MODEL_TENSOR.FC:                        "fc",
+    MODEL_TENSOR.D2T:                       "d2t",
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -4052,8 +4052,8 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
-        MODEL_TENSOR.EAGLE3_FC,
-        MODEL_TENSOR.EAGLE3_D2T,
+        MODEL_TENSOR.FC,
+        MODEL_TENSOR.D2T,
     ],
     MODEL_ARCH.MISTRAL4: [
         MODEL_TENSOR.TOKEN_EMBD,
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index f88c3bf74e11..6af9b0df9848 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -292,16 +292,16 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
 
-    { LLM_KV_EAGLE3_EXTRACT_LAYERS,        "%s.extract_layers"        },
-    { LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE,    "%s.target_hidden_size"    },
-    { LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL,  "%s.norm_before_residual"  },
+    { LLM_KV_TARGET_LAYERS,         "%s.target_layers"        },
+    { LLM_KV_TARGET_HIDDEN_SIZE,    "%s.target_hidden_size"   },
+    { LLM_KV_NORM_BEFORE_RESIDUAL,  "%s.norm_before_residual" },
 
     { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" },
     // sentence-transformers dense modules feature dims
     { LLM_KV_DENSE_2_FEAT_IN,        "%s.dense_2_feat_in"  },
-    { LLM_KV_DENSE_2_FEAT_OUT,       "%s.dense_2_feat_out"  },
-    { LLM_KV_DENSE_3_FEAT_IN,        "%s.dense_3_feat_in"   },
-    { LLM_KV_DENSE_3_FEAT_OUT,       "%s.dense_3_feat_out"  },
+    { LLM_KV_DENSE_2_FEAT_OUT,       "%s.dense_2_feat_out" },
+    { LLM_KV_DENSE_3_FEAT_IN,        "%s.dense_3_feat_in"  },
+    { LLM_KV_DENSE_3_FEAT_OUT,       "%s.dense_3_feat_out" },
 
     { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
     { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
@@ -565,8 +565,8 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_INDEXER_ATTN_Q_B,                       "blk.%d.indexer.attn_q_b" },
     { LLM_TENSOR_MASKED_EMBD_CENTROIDS,                  "masked_embd_centroids" },
     { LLM_TENSOR_MASKED_EMBD_ORDERING,                   "masked_embd_ordering" },
-    { LLM_TENSOR_EAGLE3_FC,                              "fc" },
-    { LLM_TENSOR_EAGLE3_D2T,                             "d2t" },
+    { LLM_TENSOR_FC,                                     "fc" },
+    { LLM_TENSOR_D2T,                                    "d2t" },
 };
 
 // declare information about the model weight tensors:
@@ -794,8 +794,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_MASKED_EMBD_CENTROIDS,      {LLM_TENSOR_LAYER_INPUT,     GGML_OP_NONE}},
     {LLM_TENSOR_MASKED_EMBD_ORDERING,       {LLM_TENSOR_LAYER_INPUT,     GGML_OP_NONE}},
     // eagle3
-    {LLM_TENSOR_EAGLE3_FC,                  {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_EAGLE3_D2T,                 {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_FC,                         {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_D2T,                        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_GET_ROWS}},
 };
 
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}
diff --git a/src/llama-arch.h b/src/llama-arch.h
index dc06d02157ca..723d2f8d2371 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -337,9 +337,9 @@ enum llm_kv {
 
     LLM_KV_CLASSIFIER_OUTPUT_LABELS,
 
-    LLM_KV_EAGLE3_EXTRACT_LAYERS,
-    LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE,
-    LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL,
+    LLM_KV_TARGET_LAYERS,
+    LLM_KV_TARGET_HIDDEN_SIZE,
+    LLM_KV_NORM_BEFORE_RESIDUAL,
 
     LLM_KV_SHORTCONV_L_CACHE,
 
@@ -573,8 +573,8 @@ enum llm_tensor {
     LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
     LLM_TENSOR_MASKED_EMBD_CENTROIDS,
     LLM_TENSOR_MASKED_EMBD_ORDERING,
-    LLM_TENSOR_EAGLE3_FC,
-    LLM_TENSOR_EAGLE3_D2T,
+    LLM_TENSOR_FC,
+    LLM_TENSOR_D2T,
 };
 
 
diff --git a/src/llama-ext.h b/src/llama-ext.h
index 105daa367c96..b744af52864b 100644
--- a/src/llama-ext.h
+++ b/src/llama-ext.h
@@ -114,11 +114,7 @@ LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx);
 // model/context data extraction
 //
 
-//
-// eagle3/DFlash: consume target model extracted features
-//
-
 // returns pointer to the target-model layer indices
-LLAMA_API const int32_t * llama_model_target_extract_layers  (const struct llama_model * model);
+LLAMA_API const int32_t * llama_model_target_layer_ids  (const struct llama_model * model);
 // returns the number of extracted layers from target model
-LLAMA_API uint32_t        llama_model_n_target_extract_layers(const struct llama_model * model);
+LLAMA_API uint32_t        llama_model_target_layer_ids_n(const struct llama_model * model);
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0e81a49768e0..7281ed79f105 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2692,11 +2692,11 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid,
     }
 }
 
-const int32_t * llama_model_target_extract_layers(const struct llama_model * model) {
-    const auto & v = model->target_extract_layers;
+const int32_t * llama_model_target_layer_ids(const struct llama_model * model) {
+    const auto & v = model->target_layer_ids;
     return v.empty() ? nullptr : v.data();
 }
 
-uint32_t llama_model_n_target_extract_layers(const struct llama_model * model) {
-    return (uint32_t) model->target_extract_layers.size();
+uint32_t llama_model_target_layer_ids_n(const struct llama_model * model) {
+    return (uint32_t) model->target_layer_ids.size();
 }
diff --git a/src/llama-model.h b/src/llama-model.h
index a350de4c01e6..f4718f6d5842 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -574,7 +574,7 @@ struct llama_model {
     struct ggml_tensor * d2t = nullptr;  // draft to target vocabulary mapping
 
     // unified vector to store target-model extracted layer ids in eagle3, dflash, etc.
-    std::vector<int32_t> target_extract_layers;
+    std::vector<int32_t> target_layer_ids;
 
     std::vector<llama_layer> layers;
 
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index 3a299126b574..a72eca889f6a 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -3,29 +3,29 @@
 void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
-    if (!ml.get_arr(LLM_KV_EAGLE3_EXTRACT_LAYERS, target_extract_layers, false)) {
+    if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) {
         throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata");
     }
-    if (target_extract_layers.size() != 3) {
+    if (target_layer_ids.size() != 3) {
         throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'");
     }
     LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__,
-            target_extract_layers[0],
-            target_extract_layers[1],
-            target_extract_layers[2]);
+            target_layer_ids[0],
+            target_layer_ids[1],
+            target_layer_ids[2]);
 
     uint32_t n_embd_tgt = 0;
 
-    ml.get_key(LLM_KV_EAGLE3_TARGET_HIDDEN_SIZE, n_embd_tgt);
+    ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt);
     LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd);
 
-    hparams.n_embd_inp_impl = (uint32_t) target_extract_layers.size() * n_embd_tgt;
+    hparams.n_embd_inp_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt;
 
     // eagle3 norm_before_residual (optional, default false)
     // compatible with Readhat eagle3 speculator model
-    ml.get_key(LLM_KV_EAGLE3_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false);
+    ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false);
     if (hparams.norm_before_residual) {
-        LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__);
+        LLAMA_LOG_INFO("%s: EAGLE3 norm_before_residual = true\n", __func__);
     }
 
     type = LLM_TYPE_UNKNOWN;
@@ -43,7 +43,7 @@ void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
     const struct ggml_tensor * d2t_meta = ml->get_tensor_meta("d2t");
     if (d2t_meta) {
         n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size
-        d2t = create_tensor(tn(LLM_TENSOR_EAGLE3_D2T), {n_draft_vocab}, 0);
+        d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0);
         LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab);
     } else {
         d2t = nullptr; // no d2t, use default vocab size
@@ -51,7 +51,7 @@ void llama_model_eagle3::load_arch_tensors(llama_model_loader &) {
     }
 
     // Feature fusion layer: projects 3 target layers to draft hidden size
-    fc = create_tensor(tn(LLM_TENSOR_EAGLE3_FC, "weight"), {n_embd_inp, n_embd}, 0);
+    fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0);
 
     // Output layer (uses draft vocab size)
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);

From 0bd54498f273bf290a8fd55152deedf8e7c878dc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Thu, 11 Jun 2026 16:06:27 +0300
Subject: [PATCH 26/27] cont : add assert + comment

---
 src/llama-context.cpp | 1 +
 src/llama-graph.cpp   | 1 +
 src/models/eagle3.cpp | 6 ------
 3 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 23f5a7ee29cf..168dbabd7667 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1152,6 +1152,7 @@ void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) {
 
     cparams.embeddings_layer_inp[lid] = enable;
 
+    // note: without this reserve, the draft acceptance drops to zero. not sure why - this is unexpected
     sched_need_reserve = true;
 }
 
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index fe28385d3b7e..7468bd9b79ef 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -953,6 +953,7 @@ void llm_graph_result::set_outputs(const llm_graph_params & params) {
         const auto & embeddings_layer_inp = params.cparams.embeddings_layer_inp;
         for (size_t il = 0; il < embeddings_layer_inp.size(); ++il) {
             if (embeddings_layer_inp[il]) {
+                GGML_ASSERT(t_layer_inp[il] != nullptr && "layer input tensor is null");
                 ggml_set_output(t_layer_inp[il]);
             }
         }
diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp
index a72eca889f6a..3321b390515d 100644
--- a/src/models/eagle3.cpp
+++ b/src/models/eagle3.cpp
@@ -192,8 +192,6 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
 
     const float kq_scale = 1.0f/sqrtf(float(n_embd_head));
 
-    ggml_tensor * inp_out_ids = build_inp_out_ids();
-
     // Single decoder layer (il = 0)
     const int il = 0;
     {
@@ -286,10 +284,6 @@ llama_model_eagle3::graph<false>::graph(const llama_model & model, const llm_gra
     ggml_set_output(cur);
     res->t_h_nextn = cur;
 
-    if (inp_out_ids) {
-        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
-    }
-
     cur = build_norm(cur,
             model.output_norm, NULL,
             LLM_NORM_RMS, -1);

From 7c42aff5e3c7c39841d3656c2af228c55f7b5a3d Mon Sep 17 00:00:00 2001
From: Ruixiang Wang <wangruixiang07@outlook.com>
Date: Thu, 11 Jun 2026 23:09:09 +0200
Subject: [PATCH 27/27] Update conversion/llama.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
---
 conversion/llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conversion/llama.py b/conversion/llama.py
index cad802bf8246..b87bf92d4633 100644
--- a/conversion/llama.py
+++ b/conversion/llama.py
@@ -230,8 +230,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             if name == "t2d":
                 # not used at runtime, skip
                 return
-            if name == "model.layers.0.hidden_norm.weight":
-                yield ("blk.0.attn_norm_2.weight", data_torch)
+            if name.endswith(".hidden_norm.weight"):
+                yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch)
                 return
 
         n_head = self.find_hparam(["n_heads", "num_attention_heads"])