From 47106bd3ae2c65259a4502aa876ebd7c295724f7 Mon Sep 17 00:00:00 2001 From: TheTom Date: Mon, 8 Jun 2026 12:40:25 -0500 Subject: [PATCH 01/19] fork: drop pre-norm Qwen MTP (#149) ahead of upstream MTP lineage This fork's pre-norm MTP optimization is superseded by upstream's maintained post-norm/nextn MTP, brought in by the commits that follow. Removing it here so that lineage applies onto a clean base instead of colliding with it. --- common/speculative.cpp | 5 ---- src/llama-context.cpp | 45 +++++++++++++-------------------- src/llama-context.h | 5 ++-- src/llama-graph.cpp | 8 ------ src/models/qwen35.cpp | 29 +++------------------ src/models/qwen35moe.cpp | 29 +++------------------ tools/server/server-context.cpp | 9 +++---- 7 files changed, 31 insertions(+), 99 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index ebaef15bde11..3488b9393c5a 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -539,12 +539,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); } - // During prompt sync the draft MTP context only needs its cache/state - // updated. Host-visible pre-norm rows are consumed during draft() - // generation, not while mirroring prompt batches. 
- llama_set_embeddings_pre_norm(ctx_dft, false); const int32_t rc = llama_decode(ctx_dft, batch); - llama_set_embeddings_pre_norm(ctx_dft, true); if (rc != 0) { LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); return false; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c6ce7c707db1..083cb673ef71 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -888,23 +888,14 @@ float * llama_context::get_embeddings_pre_norm() { } float * llama_context::get_embeddings_pre_norm_ith(int32_t i) { + output_reorder(); + try { if (embd_pre_norm.data == nullptr) { throw std::runtime_error("no pre-norm embeddings"); } - int64_t j = i; - if (j < 0) { - j = n_outputs_pre_norm + j; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs_pre_norm)); - } - } - - if (j >= n_outputs_pre_norm) { - throw std::runtime_error(format("pre-norm embeddings id out of range [0, %d)", n_outputs_pre_norm)); - } - + const int64_t j = output_resolve_row(i); const uint32_t n_embd = model.hparams.n_embd; return embd_pre_norm.data + j*n_embd; } catch (const std::exception & err) { @@ -1355,7 +1346,6 @@ int llama_context::encode(const llama_batch & batch_inp) { } n_outputs = n_tokens; - n_outputs_pre_norm = cparams.embeddings_pre_norm ? n_tokens : 0; const auto causal_attn_org = cparams.causal_attn; @@ -1741,13 +1731,12 @@ int llama_context::decode(const llama_batch & batch_inp) { } // reserve output buffer - if (output_reserve(n_outputs_all, cparams.embeddings_pre_norm ? 
n_tokens_all : n_outputs_all) < n_outputs_all) { + if (output_reserve(n_outputs_all) < n_outputs_all) { LLAMA_LOG_ERROR("%s: could not reserve space for batch with %d outputs\n", __func__, n_outputs_all); return -2; }; int64_t n_outputs_prev = 0; - int64_t n_outputs_pre_norm_prev = 0; do { const auto & ubatch = mctx->get_ubatch(); @@ -1893,17 +1882,16 @@ int llama_context::decode(const llama_batch & batch_inp) { // extract pre-norm embeddings (hidden state before the final output norm) // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored. - if (embd_pre_norm.data && t_h_pre_norm && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + if (embd_pre_norm.data && t_h_pre_norm && n_outputs > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm); GGML_ASSERT(backend_h != nullptr); const uint32_t n_embd = hparams.n_embd; - const int64_t n_outputs_pre_norm_new = t_h_pre_norm->ne[1]; - float * embd_pre_norm_out = embd_pre_norm.data + n_outputs_pre_norm_prev*n_embd; + float * embd_pre_norm_out = embd_pre_norm.data + n_outputs_prev*n_embd; - GGML_ASSERT(n_outputs_pre_norm_prev + n_outputs_pre_norm_new <= (int64_t) embd_pre_norm.size/(int64_t)n_embd); - ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_outputs_pre_norm_new*n_embd*sizeof(float)); - n_outputs_pre_norm_prev += n_outputs_pre_norm_new; + GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); + GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_pre_norm.size); + ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_outputs*n_embd*sizeof(float)); } // Copy backend sampling output if this ubatch produced any sampling tensors. 
@@ -1924,7 +1912,6 @@ int llama_context::decode(const llama_batch & batch_inp) { // set to total number of outputs in the batch, for use in llama_get_logits_ith n_outputs = n_outputs_all; - n_outputs_pre_norm = n_outputs_pre_norm_prev; // set output mappings if (n_outputs > 0) { @@ -1983,14 +1970,11 @@ int llama_context::decode(const llama_batch & batch_inp) { // output // -uint32_t llama_context::output_reserve(int32_t n_outputs, int32_t n_outputs_pre_norm_req) { +uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); - const int64_t n_outputs_pre_norm_max = cparams.embeddings_pre_norm - ? std::max(n_outputs_pre_norm_req < 0 ? n_outputs : n_outputs_pre_norm_req, n_seq_max()) - : 0; const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); @@ -2013,7 +1997,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, int32_t n_outputs_pre_ logits.size = has_logits ? n_vocab*n_outputs_max : 0; embd.size = has_embd ? n_embd_out*n_outputs_max : 0; - embd_pre_norm.size = has_embd_pre_norm ? n_embd*n_outputs_pre_norm_max : 0; + embd_pre_norm.size = has_embd_pre_norm ? n_embd*n_outputs_max : 0; // Allocate backend sampling output buffers if there are backend samplers configured. 
const bool has_sampling = !sampling.samplers.empty(); @@ -2118,7 +2102,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, int32_t n_outputs_pre_ std::fill(output_ids.begin(), output_ids.end(), -1); this->n_outputs = 0; - this->n_outputs_pre_norm = 0; return n_outputs_max; } @@ -2143,6 +2126,12 @@ void llama_context::output_reorder() { } } + if (embd_pre_norm.size > 0) { + for (uint64_t k = 0; k < n_embd; k++) { + std::swap(embd_pre_norm.data[i0*n_embd + k], embd_pre_norm.data[i1*n_embd + k]); + } + } + if (!sampling.samplers.empty()) { assert(sampling.logits.size > 0); assert(sampling.probs.size > 0); diff --git a/src/llama-context.h b/src/llama-context.h index 0a9e40de1628..e16ac4c618ba 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -218,7 +218,7 @@ struct llama_context { // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. - uint32_t output_reserve(int32_t n_outputs, int32_t n_outputs_pre_norm = -1); + uint32_t output_reserve(int32_t n_outputs); void output_reorder(); @@ -282,11 +282,10 @@ struct llama_context { // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE buffer_view embd = {nullptr, 0}; - // hidden state before the final output norm (2-dimensional array: [n_outputs_pre_norm][n_embd]) + // hidden state before the final output norm (2-dimensional array: [n_outputs][n_embd]) // populated only when cparams.embeddings_pre_norm is enabled and the model graph // sets llm_graph_result::t_h_pre_norm buffer_view embd_pre_norm = {nullptr, 0}; - int32_t n_outputs_pre_norm = 0; struct sampling_info { // !samplers.empty() to check if any samplers are active diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index be10cd28a164..f4ca37c5160f 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -177,14 +177,6 @@ void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { 
GGML_ASSERT(out_ids); - if (n_outputs == 0) { - return; - } - - if (out_ids->buffer == nullptr) { - return; - } - const int64_t n_tokens = ubatch->n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 54b5fb186907..2b4d5b14cd42 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -155,7 +155,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para auto * inp = build_inp_mem_hybrid(); ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = (n_outputs > 0 && (!cparams.embeddings_pre_norm || n_outputs < n_tokens)) ? build_inp_out_ids() : nullptr; + ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; @@ -176,7 +176,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && !cparams.embeddings_pre_norm) { + if (il == n_transformer_layers - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -211,16 +211,6 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "h_pre_norm", -1); res->t_h_pre_norm = cur; - if (n_outputs == 0) { - ggml_build_forward_expand(gf, cur); - return; - } - - if (inp_out_ids && cparams.embeddings_pre_norm && n_outputs < n_tokens) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cb(cur, "h_pre_norm_out", -1); - } - // Final norm cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); @@ -530,9 +520,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr res->add_input(std::move(inp)); - ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = 
(n_outputs > 0 && n_outputs < n_tokens) ? build_inp_out_ids() : nullptr; - auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); cb(h_norm, "mtp_hnorm", il); @@ -621,16 +610,6 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr cb(cur, "h_pre_norm", -1); res->t_h_pre_norm = cur; - if (n_outputs == 0) { - ggml_build_forward_expand(gf, cur); - return; - } - - if (inp_out_ids && n_outputs < n_tokens) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cb(cur, "mtp_h_pre_norm_out", -1); - } - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? layer.nextn.shared_head_norm : model.output_norm; diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 029e1a4febfd..22e3e1107655 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -178,7 +178,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p auto * inp = build_inp_mem_hybrid(); ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = (n_outputs > 0 && (!cparams.embeddings_pre_norm || n_outputs < n_tokens)) ? build_inp_out_ids() : nullptr; + ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. 
const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; @@ -199,7 +199,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && !cparams.embeddings_pre_norm) { + if (il == n_transformer_layers - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -234,16 +234,6 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p cb(cur, "h_pre_norm", -1); res->t_h_pre_norm = cur; - if (n_outputs == 0) { - ggml_build_forward_expand(gf, cur); - return; - } - - if (inp_out_ids && cparams.embeddings_pre_norm && n_outputs < n_tokens) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cb(cur, "h_pre_norm_out", -1); - } - // Final norm cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); @@ -594,9 +584,8 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm res->add_input(std::move(inp)); - ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = (n_outputs > 0 && n_outputs < n_tokens) ? build_inp_out_ids() : nullptr; - auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); cb(h_norm, "mtp_hnorm", il); @@ -717,16 +706,6 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm cb(cur, "h_pre_norm", -1); res->t_h_pre_norm = cur; - if (n_outputs == 0) { - ggml_build_forward_expand(gf, cur); - return; - } - - if (inp_out_ids && n_outputs < n_tokens) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cb(cur, "mtp_h_pre_norm_out", -1); - } - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? 
layer.nextn.shared_head_norm : model.output_norm; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index abae3d1f2655..3dcc03d0290b 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -241,11 +241,7 @@ struct server_slot { bool need_embd() const { GGML_ASSERT(task); - return task->need_embd(); - } - - bool need_embd_pre_norm() const { - return spec && common_speculative_need_embd(spec); + return task->need_embd() || (spec && common_speculative_need_embd(spec)); } // if the context does not have a memory module then all embeddings have to be computed within a single ubatch @@ -2893,6 +2889,9 @@ struct server_context_impl { break; } + // embedding requires all tokens in the batch to be output; + // MTP also wants logits at every prompt position so the + // streaming hook can mirror t_h_pre_norm into ctx_dft. common_batch_add(batch, cur_tok, slot.prompt.tokens.pos_next(), From 19c7616dd6911a703f5bf6528e0eb025a7fb2ce8 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 17 May 2026 23:30:25 +0800 Subject: [PATCH 02/19] llama: avoid copying logits during prompt decode in MTP (#23198) * llama: avoid copying logits during prompt decode in MTP * review: update comment * llama-graph: call set_output for t_h_pre_norm (cherry picked from commit 3e12fbdea5c1ac4225c7dcf79506d30950283fc3) --- common/speculative.cpp | 27 +++++++++++++++-- common/speculative.h | 5 +++- src/llama-context.cpp | 51 ++++++++++++++++++++++++--------- src/llama-context.h | 2 +- src/llama-cparams.h | 3 +- src/llama-ext.h | 10 +++---- src/llama-graph.cpp | 3 ++ src/models/qwen35.cpp | 6 +++- src/models/qwen35moe.cpp | 6 +++- tools/server/server-context.cpp | 5 ++++ 10 files changed, 91 insertions(+), 27 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 3488b9393c5a..e591bab875db 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -146,8 +146,11 @@ struct common_speculative_impl { virtual void 
accept(llama_seq_id seq_id, uint16_t n_accepted) = 0; - // true if this implementation requires the target context to extract embeddings + // true if this implementation requires the target context to extract post-norm embeddings virtual bool need_embd() const = 0; + + // true if this implementation requires the target context to extract pre-norm embeddings + virtual bool need_embd_pre_norm() const { return false; } }; struct common_speculative_impl_draft_simple : public common_speculative_impl { @@ -429,8 +432,8 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams)); } - llama_set_embeddings_pre_norm(ctx_tgt, true); - llama_set_embeddings_pre_norm(ctx_dft, true); + llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false); + llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true); pending_h.assign(n_seq, std::vector(n_embd, 0.0f)); @@ -691,6 +694,10 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { } bool need_embd() const override { + return false; + } + + bool need_embd_pre_norm() const override { return true; } }; @@ -1408,6 +1415,20 @@ bool common_speculative_need_embd(common_speculative * spec) { return false; } +bool common_speculative_need_embd_pre_norm(common_speculative * spec) { + if (spec == nullptr) { + return false; + } + + for (auto & impl : spec->impls) { + if (impl->need_embd_pre_norm()) { + return true; + } + } + + return false; +} + void common_speculative_draft(common_speculative * spec) { if (spec == nullptr) { return; diff --git a/common/speculative.h b/common/speculative.h index 614db9b1b509..f24bac79edb7 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -53,9 +53,12 @@ void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, co // process the batch and update the internal state of the speculative context bool common_speculative_process(common_speculative * spec, const llama_batch 
& batch); -// true if any implementation requires target embeddings to be extracted +// true if any implementation requires target post-norm embeddings to be extracted bool common_speculative_need_embd(common_speculative * spec); +// true if any implementation requires target pre-norm embeddings to be extracted +bool common_speculative_need_embd_pre_norm(common_speculative * spec); + // generate drafts for the sequences specified with `common_speculative_get_draft_params` void common_speculative_draft(common_speculative * spec); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 083cb673ef71..691700978fdc 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -895,8 +895,17 @@ float * llama_context::get_embeddings_pre_norm_ith(int32_t i) { throw std::runtime_error("no pre-norm embeddings"); } - const int64_t j = output_resolve_row(i); const uint32_t n_embd = model.hparams.n_embd; + + if (!cparams.embeddings_pre_norm_masked) { + // unmasked: pre-norm rows are stored densely, indexed by raw token position. 
+ if (i < 0 || (size_t)(i + 1) * n_embd > embd_pre_norm.size) { + throw std::runtime_error(format("out of range [0, %zu)", embd_pre_norm.size / n_embd)); + } + return embd_pre_norm.data + (size_t) i * n_embd; + } + + const int64_t j = output_resolve_row(i); return embd_pre_norm.data + j*n_embd; } catch (const std::exception & err) { LLAMA_LOG_ERROR("%s: invalid pre-norm embeddings id %d, reason: %s\n", __func__, i, err.what()); @@ -1088,10 +1097,11 @@ void llama_context::set_embeddings(bool value) { //sched_need_reserve = true; } -void llama_context::set_embeddings_pre_norm(bool value) { - LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); +void llama_context::set_embeddings_pre_norm(bool value, bool masked) { + LLAMA_LOG_DEBUG("%s: value = %d, masked = %d\n", __func__, value, masked); - cparams.embeddings_pre_norm = value; + cparams.embeddings_pre_norm = value; + cparams.embeddings_pre_norm_masked = masked; } void llama_context::set_causal_attn(bool value) { @@ -1737,6 +1747,7 @@ int llama_context::decode(const llama_batch & batch_inp) { }; int64_t n_outputs_prev = 0; + int64_t n_tokens_prev = 0; do { const auto & ubatch = mctx->get_ubatch(); @@ -1882,16 +1893,21 @@ int llama_context::decode(const llama_batch & batch_inp) { // extract pre-norm embeddings (hidden state before the final output norm) // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored. - if (embd_pre_norm.data && t_h_pre_norm && n_outputs > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm); - GGML_ASSERT(backend_h != nullptr); + { + const bool masked = cparams.embeddings_pre_norm_masked; + const int64_t n_rows = masked ? n_outputs : (int64_t) ubatch.n_tokens; + const int64_t offset = masked ? 
n_outputs_prev : n_tokens_prev; + + if (embd_pre_norm.data && t_h_pre_norm && n_rows > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm); + GGML_ASSERT(backend_h != nullptr); - const uint32_t n_embd = hparams.n_embd; - float * embd_pre_norm_out = embd_pre_norm.data + n_outputs_prev*n_embd; + const uint32_t n_embd = hparams.n_embd; + float * embd_pre_norm_out = embd_pre_norm.data + offset*n_embd; - GGML_ASSERT( n_outputs_prev + n_outputs <= n_outputs_all); - GGML_ASSERT((n_outputs_prev + n_outputs)*n_embd <= (int64_t) embd_pre_norm.size); - ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_outputs*n_embd*sizeof(float)); + GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_pre_norm.size); + ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_rows*n_embd*sizeof(float)); + } } // Copy backend sampling output if this ubatch produced any sampling tensors. @@ -1908,6 +1924,7 @@ int llama_context::decode(const llama_batch & batch_inp) { } n_outputs_prev += n_outputs; + n_tokens_prev += ubatch.n_tokens; } while (mctx->next()); // set to total number of outputs in the batch, for use in llama_get_logits_ith @@ -1999,6 +2016,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd.size = has_embd ? n_embd_out*n_outputs_max : 0; embd_pre_norm.size = has_embd_pre_norm ? n_embd*n_outputs_max : 0; + if (has_embd_pre_norm && !cparams.embeddings_pre_norm_masked) { + // unmasked: pre-norm row exists for every token in the batch, not just + // those flagged via batch.logits[i] -> size by token count instead. + embd_pre_norm.size = (size_t) n_embd * n_batch; + } + // Allocate backend sampling output buffers if there are backend samplers configured. 
const bool has_sampling = !sampling.samplers.empty(); if (has_sampling) { @@ -3572,8 +3595,8 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) { return ctx->get_embeddings_seq(seq_id); } -void llama_set_embeddings_pre_norm(llama_context * ctx, bool value) { - ctx->set_embeddings_pre_norm(value); +void llama_set_embeddings_pre_norm(llama_context * ctx, bool value, bool masked) { + ctx->set_embeddings_pre_norm(value, masked); } float * llama_get_embeddings_pre_norm(llama_context * ctx) { diff --git a/src/llama-context.h b/src/llama-context.h index e16ac4c618ba..d03f681d4a13 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -110,7 +110,7 @@ struct llama_context { void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); void set_embeddings (bool value); - void set_embeddings_pre_norm(bool value); + void set_embeddings_pre_norm(bool value, bool masked); void set_causal_attn(bool value); void set_warmup(bool value); diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 5898a1c38d51..20ec59fe3357 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -28,7 +28,8 @@ struct llama_cparams { float yarn_beta_slow; bool embeddings; - bool embeddings_pre_norm; // also extract the hidden state before the final output norm + bool embeddings_pre_norm; // also extract the hidden state before the final output norm + bool embeddings_pre_norm_masked; // extract for only rows where batch.logits != 0 bool causal_attn; bool offload_kqv; bool flash_attn; diff --git a/src/llama-ext.h b/src/llama-ext.h index 11f1986676a5..edfa71c207c5 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -93,14 +93,14 @@ LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_c // pre-norm embeddings (hidden state before the final output norm) // -// mirrors: -// LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings); -LLAMA_API void llama_set_embeddings_pre_norm(struct 
llama_context * ctx, bool value); +// Set whether the context outputs pre-norm embeddings or not +// If masked == true, output the embeddings only for the tokens with batch.logits != 0 +// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits +LLAMA_API void llama_set_embeddings_pre_norm(struct llama_context * ctx, bool value, bool masked); // mirrors: // LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); -LLAMA_API float * llama_get_embeddings_pre_norm(struct llama_context * ctx); +LLAMA_API float * llama_get_embeddings_pre_norm (struct llama_context * ctx); -// mirrors: // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx, int32_t i); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f4ca37c5160f..f156451f6817 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -848,6 +848,9 @@ void llm_graph_result::set_outputs() { if (t_embd_pooled != nullptr) { ggml_set_output(t_embd_pooled); } + if (t_h_pre_norm != nullptr) { + ggml_set_output(t_h_pre_norm); + } for (auto & [seq_id, t] : t_sampled) { if (t != nullptr) { ggml_set_output(t); diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 2b4d5b14cd42..361d7538a038 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -176,7 +176,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_pre_norm_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -211,6 +211,10 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "h_pre_norm", -1); res->t_h_pre_norm = cur; + if 
(!cparams.embeddings_pre_norm_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + // Final norm cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 22e3e1107655..4f63c410d668 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -199,7 +199,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_pre_norm_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -234,6 +234,10 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p cb(cur, "h_pre_norm", -1); res->t_h_pre_norm = cur; + if (!cparams.embeddings_pre_norm_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + // Final norm cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 3dcc03d0290b..bb28fad4a5ee 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -244,6 +244,11 @@ struct server_slot { return task->need_embd() || (spec && common_speculative_need_embd(spec)); } + bool need_embd_pre_norm() const { + GGML_ASSERT(task); + return spec && common_speculative_need_embd_pre_norm(spec); + } + // if the context does not have a memory module then all embeddings have to be computed within a single ubatch // also we cannot split if the pooling would require any past tokens // (MTP supports splitting — uses task->need_embd() not need_embd()) From 672261e7214951ff6a9cd6574d6f57e101784a6b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 May 2026 15:32:58 +0300 Subject: [PATCH 03/19] llama : MTP clean-up (#23269) * 
llama : disable equal splits for recurrent memory with partial rollback * spec : re-enable p-min with MTP drafts * spec : re-enable ngram spec in combination with RS rollback * spec : fix ngram-map-* params * spec : fix acceptance logic in combined ngram + draft configs * graph : fix reuse for combined `token` + `embd` batches * spec : log parameters for each speculative implementation - add LOG_INF in each constructor with implementation type and parameters - extract device string logic into common_speculative_get_devices_str() - move 'adding speculative implementation' log from init into constructors Assisted-by: llama.cpp:local pi * spec : extend --spec-default with ngram-map-k4v Assisted-by: llama.cpp:local pi * minor : fix n_embd log * args : update draft.n_max == 3 + regen docs * spec : relax ngram-mod rejection thold to 0.25 @ 5 low * logs : improve * docs : update speculative decoding CLI argument documentation - Add missing draft model CPU scheduling and tensor override parameters - Update --spec-type to include all available types (excluding draft-eagle3 WIP) - Fix default values to match implementation (n_max=3, n_min=0, p_min=0.0) - Remove deprecated options (spec-draft-ctx-size, spec-draft-replace) - Add environment variables for new parameters Assisted-by: llama.cpp:local pi * arg : step-back on adding k4v to the default spec config * cont : fix name (cherry picked from commit d14ce3dab4de197adec5166faa54ac5db8262f26) --- common/arg.cpp | 18 +++- common/common.cpp | 23 ----- common/common.h | 8 +- common/ngram-map.cpp | 2 +- common/speculative.cpp | 150 +++++++++++++++++++++++-------- docs/speculative.md | 77 +++++++++++++--- src/llama-graph.h | 3 +- src/llama-memory-hybrid-iswa.cpp | 12 ++- src/llama-memory-hybrid.cpp | 12 ++- src/llama-memory-recurrent.cpp | 12 ++- src/llama-memory-recurrent.h | 1 + src/models/delta-net-base.cpp | 97 ++++++++++++-------- src/models/models.h | 3 - tools/cli/README.md | 4 +- tools/server/README.md | 5 +- 15 files 
changed, 293 insertions(+), 134 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index a4e5bc661202..0ec3d67700c5 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -540,7 +540,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } if (!seen_args.insert(arg).second) { - LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + const bool skip = (arg == "--spec-type"); + + if (!skip) { + LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + } } auto & tmp = arg_to_options[arg]; auto opt = *tmp.first; @@ -903,7 +907,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::mapsamplers_seq_config.size(); } - // [TAG_RS_STATE_ROLLBACK_SUPPORT] - // TODO: ngram speculative methods require checkpointing in addition to partial RS rollback - // currently this is not supported. 
so we disable the partial rollback - if (cparams.n_rs_seq > 0 && (llama_model_is_recurrent(model) || llama_model_is_hybrid(model))) { - auto & types = params.speculative.types; - - for (int i = 0; i < (int) types.size(); i++) { - if (types[i] == COMMON_SPECULATIVE_TYPE_NONE) { - continue; - } - if (types[i] == COMMON_SPECULATIVE_TYPE_DRAFT_MTP) { - continue; - } - - cparams.n_rs_seq = 0; - - LOG_WRN("%s: recurrent state rollback is not compatible with '%s' - disabling rollback support\n", __func__, - common_speculative_type_to_str(types[i]).c_str()); - - break; - } - } - llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); diff --git a/common/common.h b/common/common.h index 46efe749185b..b53a6ed86f60 100644 --- a/common/common.h +++ b/common/common.h @@ -299,11 +299,11 @@ struct common_params_model { // draft-model-based speculative decoding parameters struct common_params_speculative_draft { - int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding - int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding + int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding + int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding - float p_split = 0.1f; // speculative decoding split probability - float p_min = 0.75f; // minimum speculative decoding probability (greedy) // TODO: change default to 0.0f + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.0f; // minimum speculative decoding probability (greedy) common_params_model mparams; diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index 8e3978f7ed02..a2a4e50575f2 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map, draft.push_back(inp[match_pos + n + i]); } - 
LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__, + LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__, key_offset, slot_max, curr_key.key_num, draft.size()); diff --git a/common/speculative.cpp b/common/speculative.cpp index e591bab875db..4d1b61a13ad9 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -32,6 +32,19 @@ const std::map common_speculative_type_fro {"ngram-cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; +static std::string common_speculative_get_devices_str(const std::vector & devices) { + if (devices.empty()) { + return "default"; + } + + std::string result; + for (size_t i = 0; i < devices.size(); i++) { + if (i > 0) result += ", "; + result += ggml_backend_dev_name(devices[i]); + } + return result; +} + struct common_speculative_config { common_speculative_type type; common_params_speculative params; @@ -144,7 +157,7 @@ struct common_speculative_impl { virtual void draft(common_speculative_draft_params_vec & dparams) = 0; - virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0; + virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0; // true if this implementation requires the target context to extract post-norm embeddings virtual bool need_embd() const = 0; @@ -167,6 +180,16 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; auto * ctx_tgt = this->params.ctx_tgt; + LOG_INF("%s: adding speculative implementation 'draft-simple'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min); + LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, + this->params.n_gpu_layers, + ggml_type_name(this->params.cache_type_k), + ggml_type_name(this->params.cache_type_v), + ctx_tgt ? "yes" : "no", + ctx_dft ? 
"yes" : "no", + common_speculative_get_devices_str(this->params.devices).c_str()); + batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1); // TODO: optimize or pass from outside? @@ -343,7 +366,7 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -355,8 +378,12 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { //common_params_speculative_eagle3 params; - common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq) - : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {} + common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq) + : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) + { + LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min); + } void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -371,7 +398,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // TODO: implement } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -380,7 +407,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } }; -struct common_speculative_state_draft_mtp : public common_speculative_impl { +struct common_speculative_impl_draft_mtp : public common_speculative_impl { common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft) llama_batch 
batch; @@ -407,7 +434,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { // pre-advancement before process() mirrored the verify batch. std::vector last_n_drafted; - common_speculative_state_draft_mtp(const common_params_speculative & params, uint32_t n_seq) + common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq) , params(params.draft) { @@ -417,6 +444,16 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { n_embd = llama_model_n_embd(llama_get_model(ctx_dft)); + LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd); + LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, + this->params.n_gpu_layers, + ggml_type_name(this->params.cache_type_k), + ggml_type_name(this->params.cache_type_v), + ctx_tgt ? "yes" : "no", + ctx_dft ? "yes" : "no", + common_speculative_get_devices_str(this->params.devices).c_str()); + const int32_t n_b = (int32_t) llama_n_batch(ctx_dft); batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd, /*n_seq_max=*/ 1); // llama_batch_init allocates only one of token/embd; MTP needs both. 
@@ -427,7 +464,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { for (auto & s : smpls) { common_params_sampling sparams; sparams.no_perf = false; - sparams.top_k = 1; // TODO: re-enable top_k == 10 and utilize `p_min` spec param + sparams.top_k = 10; sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K }; s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams)); } @@ -446,7 +483,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { last_n_drafted.assign(n_seq, 0); } - ~common_speculative_state_draft_mtp() override { + ~common_speculative_impl_draft_mtp() override { if (batch.token != nullptr) { free(batch.token); batch.token = nullptr; @@ -462,7 +499,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); if (pos_max < N - 1) { - LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d — " + LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - " "process() hook may not have run on every prefill ubatch " "(need_embd / logits=1 on every prompt position?). 
" "Drafts may degrade.\n", @@ -633,6 +670,14 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { // add drafted token for each sequence const llama_token id = cur_p->data[0].id; + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + drafting[seq_id] = false; + n_drafting--; + + continue; + } + common_sampler_accept(smpl, id, true); auto & dp = dparams.at(seq_id); @@ -678,7 +723,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override { if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { return; } @@ -714,7 +759,12 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { common_ngram_simple_config config) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq) , params(params.ngram_simple) - , config(config) {} + , config(config) + { + LOG_INF("%s: adding speculative implementation 'ngram-simple'\n", __func__); + LOG_INF("%s: - size_n=%d, size_m=%d, min_hits=%d\n", __func__, + this->params.size_n, this->params.size_m, this->params.min_hits); + } void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -738,7 +788,7 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -748,20 +798,21 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { }; struct common_speculative_impl_ngram_map_k : public common_speculative_impl { - common_params_speculative_ngram_map params; - // n_seq configs std::vector config; common_speculative_impl_ngram_map_k( - const common_params_speculative & params, const common_ngram_map & config, 
uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, n_seq) - , params(params.ngram_map_k) { + { for (uint32_t i = 0; i < n_seq; i++) { this->config.push_back(config); } + + LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(this->type).c_str()); + LOG_INF("%s: - size_key=%d, size_value=%d, key_only=%d, min_hits=%d\n", __func__, + config.size_key, config.size_value, config.key_only, config.min_hits); } void begin(llama_seq_id seq_id, const llama_tokens & prompt) override { @@ -788,9 +839,13 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override { GGML_ASSERT((seq_id < (llama_seq_id) config.size())); + if (is_other) { + return; + } + common_ngram_map_accept(config[seq_id], n_accepted); } @@ -812,7 +867,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { // the last position in the prompt that was added to the ngram container size_t i_last = 0; - // length of the last drafted n‑gram (number of tokens returned by draft) + // length of the last drafted n-gram (number of tokens returned by draft) size_t n_draft_last = 0; // consecutive accept rounds with low acceptance fraction (< 0.5) @@ -830,8 +885,11 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { , verbose(std::getenv("LLAMA_TRACE") != nullptr) { static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t)); - LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__, - this->params.n_match, mod.size(), (float)(mod.size_bytes())/1024/1024); + LOG_INF("%s: adding speculative implementation 'ngram-mod'\n", __func__); + LOG_INF("%s: - n_match=%d, n_max=%d, n_min=%d\n", __func__, + this->params.n_match, this->params.n_max, this->params.n_min); + LOG_INF("%s: - mod size=%zu (%.3f 
MB)\n", __func__, + mod.size(), (float)(mod.size_bytes())/1024/1024); if (this->params.n_match < 16) { LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, " @@ -921,7 +979,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } result.resize(result.size() - n); - // store length of drafted n‑gram for later acceptance analysis + // store length of drafted n-gram for later acceptance analysis sinfo.n_draft_last = result.size(); } @@ -943,17 +1001,21 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override { + if (is_other) { + return; + } + auto & sinfo = sinfos[seq_id]; // compute acceptance fraction if we have a recorded draft length if (sinfo.n_draft_last > 0) { const double f_acc = (double)n_accepted / (double)sinfo.n_draft_last; - if (f_acc < 0.5) { + if (f_acc < 0.25) { sinfo.n_low++; - if (sinfo.n_low >= 3) { + if (sinfo.n_low >= 5) { if (verbose) { - LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, sinfo.n_low); + LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, sinfo.n_low); } mod.reset(); @@ -1003,6 +1065,12 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl { , save_dynamic(save_dynamic) , save_static(save_static) { + LOG_INF("%s: adding speculative implementation 'ngram-cache'\n", __func__); + LOG_INF("%s: - n_draft=%d, cache_static=%s, cache_dynamic=%s\n", __func__, + n_draft, + path_static.empty() ? "none" : path_static.c_str(), + path_dynamic.empty() ? 
"none" : path_dynamic.c_str()); + sinfos.resize(n_seq); if (!path_static.empty()) { @@ -1099,7 +1167,7 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -1285,7 +1353,6 @@ common_speculative * common_speculative_init(common_params_speculative & params, std::vector> impls = {}; for (const common_speculative_config & config : configs) { - LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(config.type).c_str()); switch (config.type) { case COMMON_SPECULATIVE_TYPE_NONE: break; @@ -1298,7 +1365,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, break; } case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: { - impls.push_back(std::make_unique(config.params, n_seq)); + impls.push_back(std::make_unique(config.params, n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { @@ -1319,11 +1386,16 @@ common_speculative * common_speculative_init(common_params_speculative & params, impls.push_back(std::move(state)); break; } - case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: { + impls.push_back( + std::make_unique( + get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: { impls.push_back( std::make_unique( - config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); + get_common_ngram_map(config.type, config.params.ngram_map_k4v), n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: { @@ -1515,11 +1587,6 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u GGML_ASSERT(impl); - // TODO: currently only the implementation that generated the draft is used to accept it - // however, some implementations (such as MTP) need to 
also "see" the accepted tokens - // extend `common_speculative_impl::accept()` with an extra argument `bool is_other` to - // inform the implementation if the accepted tokens are from another implementation and - // pass the accepted tokens to all remaining implementations using `is_other == true` { common_time_meas tm(impl->t_accept_us, !impl->gen_perf); if (n_accepted > 0) { @@ -1527,9 +1594,16 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u impl->n_acc_tokens += n_accepted; } - impl->accept(seq_id, n_accepted); + impl->accept(seq_id, n_accepted, false); impl->n_call_accept++; } + + // accept with the rest of the implementations, using is_other == true + for (auto & impl_other : spec->impls) { + if (impl_other.get() != impl) { + impl_other->accept(seq_id, n_accepted, true); + } + } } void common_speculative_print_stats(const common_speculative * spec) { @@ -1549,7 +1623,7 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } - LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", + LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n", common_speculative_type_to_str(impl->type).c_str(), impl->n_call_begin, impl->n_call_draft, impl->n_call_accept, impl->n_gen_drafts, diff --git a/docs/speculative.md b/docs/speculative.md index fb6ef03067d3..45e42d42a436 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -108,11 +108,12 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### General Speculative Parameters ``` ---spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] - type of speculative decoding to use when no draft model is provided +--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] + comma-separated list of 
types of speculative decoding to use (default: none) (env: LLAMA_ARG_SPEC_TYPE) ---spec-default use default speculative decoding +--spec-default use default speculative decoding config + (enables ngram-mod) ``` ### Draft Model Parameters @@ -123,8 +124,9 @@ If a draft model is combined with a draftless decoding the draftless decoding ha (env: LLAMA_ARG_SPEC_DRAFT_MODEL) --spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant] HuggingFace repository for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) --spec-draft-n-max N - number of tokens to draft for speculative decoding (default: 16) + number of tokens to draft for speculative decoding (default: 3) (env: LLAMA_ARG_SPEC_DRAFT_N_MAX) --spec-draft-n-min N minimum number of draft tokens to use for speculative decoding (default: 0) @@ -133,18 +135,64 @@ If a draft model is combined with a draftless decoding the draftless decoding ha speculative decoding split probability (default: 0.10) (env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) --spec-draft-p-min, --draft-p-min P - minimum speculative decoding probability (greedy) (default: 0.75) + minimum speculative decoding probability (greedy) (default: 0.00) (env: LLAMA_ARG_SPEC_DRAFT_P_MIN) ---spec-draft-ctx-size, -cd, --ctx-size-draft N - size of the prompt context for the draft model (default: 0, 0 = loaded from model) - (env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE) --spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N max. 
number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto) (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) --spec-draft-device, -devd, --device-draft comma-separated list of devices to use for offloading the draft model ---spec-draft-replace, --spec-replace TARGET DRAFT - translate the string in TARGET into DRAFT if the draft model and main model are not compatible + (use --list-devices to see available devices) +``` + +### Draft Model CPU Scheduling Parameters + +``` +--spec-draft-threads, -td, --threads-draft N + number of CPU threads to use during generation +--spec-draft-threads-batch, -tbd, --threads-batch-draft N + number of threads to use during batch and prompt processing (default: same as --threads-draft) +--spec-draft-cpu-mask, -Cd, --cpu-mask-draft M + Draft model CPU affinity mask. Complements cpu-range-draft +--spec-draft-cpu-range, -Crd, --cpu-range-draft lo-hi + Ranges of CPUs for affinity. Complements --cpu-mask-draft +--spec-draft-cpu-strict, --cpu-strict-draft <0|1> + Use strict CPU placement for draft model (default: same as --cpu-strict) +--spec-draft-prio, --prio-draft N + set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime +--spec-draft-poll, --poll-draft <0|1> + Use polling to wait for draft model work (default: same as --poll) +--spec-draft-cpu-mask-batch, -Cbd, --cpu-mask-batch-draft M + Draft model CPU affinity mask for batch. Complements cpu-range-batch-draft +--spec-draft-cpu-range-batch, -Crbd, --cpu-range-batch-draft lo-hi + Ranges of CPUs for affinity for batch. 
Complements --cpu-mask-batch-draft +--spec-draft-cpu-strict-batch, --cpu-strict-batch-draft <0|1> + Use strict CPU placement for draft model batch (default: --cpu-strict-draft) +--spec-draft-prio-batch, --prio-batch-draft N + set draft process/thread priority for batch : 0-normal, 1-medium, 2-high, 3-realtime +--spec-draft-poll-batch, --poll-batch-draft <0|1> + Use polling to wait for draft model work for batch (default: --poll-draft) +``` + +### Draft Model KV Cache and Tensor Override Parameters + +``` +--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE + KV cache data type for K for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) +--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE + KV cache data type for V for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) +--spec-draft-override-tensor, -otd, --override-tensor-draft =,... + override tensor buffer type for draft model +--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft + keep all Mixture of Experts (MoE) weights in the CPU for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) +--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N + keep the MoE weights of the first N layers in the CPU for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) ``` ### n-gram Mod Parameters @@ -193,11 +241,13 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### `--spec-type TYPE` -Specifies a type of speculative decoding without draft model. +Specifies a comma-separated list of speculative decoding types to use. 
| Type | Description | |------|-------------| | `none` | No speculative decoding (default) | +| `draft-simple` | Use a simple draft model for speculation | +| `draft-mtp` | Use Multi-Token Prediction (MTP) heads from the main model | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching | | `ngram-map-k` | Use n-gram pattern matching with n-gram-keys | @@ -209,6 +259,11 @@ Specifies a type of speculative decoding without draft model. ./llama-server [...] --spec-type ngram-simple ``` +**Example:** Multiple speculative implementations. +```bash +./llama-server [...] --spec-type ngram-mod,ngram-map-k4v +``` + ### `--spec-ngram-*-size-n N` Sets the size N of the lookup n-gram for n-gram map based speculative decoding. diff --git a/src/llama-graph.h b/src/llama-graph.h index 9e55d0a675e0..bf6778237e6f 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -581,7 +581,8 @@ struct llm_graph_params { ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && ( (!ubatch.token && !other.ubatch.token) || - (!ubatch.embd && !other.ubatch.embd) + (!ubatch.embd && !other.ubatch.embd) || + (ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd) ); // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp index a59561ea54dd..72f5c2fea725 100644 --- a/src/llama-memory-hybrid-iswa.cpp +++ b/src/llama-memory-hybrid-iswa.cpp @@ -75,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) - const bool unified = (mem_attn->get_base()->get_n_stream() == 1); - ubatch = balloc.split_equal(n_ubatch, !unified); + if (mem_recr->n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] 
+ // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) + const bool unified = (mem_attn->get_base()->get_n_stream() == 1); + ubatch = balloc.split_equal(n_ubatch, !unified); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index 5c2e2afdbef1..ed28625870b7 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -75,9 +75,15 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) - const bool unified = (mem_attn->get_n_stream() == 1); - ubatch = balloc.split_equal(n_ubatch, !unified); + if (mem_recr->n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) + const bool unified = (mem_attn->get_n_stream() == 1); + ubatch = balloc.split_equal(n_ubatch, !unified); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index aeb866657f24..ec5dc5835dd4 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -416,9 +416,15 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // TODO: non-sequential equal split can be done if using unified KV cache - // for simplicity, we always use sequential equal split for now - ubatch = balloc.split_equal(n_ubatch, true); + if (n_rs_seq > 0) { + // 
[TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // TODO: non-sequential equal split can be done if using unified KV cache + // for simplicity, we always use sequential equal split for now + ubatch = balloc.split_equal(n_ubatch, true); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h index 29c58afc9c27..b13b7b748f5e 100644 --- a/src/llama-memory-recurrent.h +++ b/src/llama-memory-recurrent.h @@ -72,6 +72,7 @@ class llama_memory_recurrent : public llama_memory_i { // number of recurrent-state snapshots per seq for rollback; tensors are widened to (1 + n_rs_seq) groups uint32_t n_rs_seq = 0; + // per-seq rollback index std::vector rs_idx; diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp index 2a4e00384e95..a67238383eda 100644 --- a/src/models/delta-net-base.cpp +++ b/src/models/delta-net-base.cpp @@ -447,13 +447,6 @@ std::pair llm_build_delta_net_base::build_delta_ne return build_delta_net_chunking(q, k, v, g, b, s, il); } -bool llm_build_delta_net_base::keep_rs() const { - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - return cparams.n_rs_seq > 0 - && n_seq_tokens > 1 - && (uint32_t) n_seq_tokens <= 1 + cparams.n_rs_seq; -} - ggml_tensor * llm_build_delta_net_base::build_conv_state( llm_graph_input_rs * inp, ggml_tensor * conv_states_all, @@ -461,12 +454,12 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state( int64_t conv_kernel_size, int64_t conv_channels, int il) { - const auto * mctx_cur = inp->mctx; - const auto kv_head = mctx_cur->get_head(); - const uint32_t mem_size = mctx_cur->get_size(); - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const bool keep = keep_rs(); + const auto * mctx_cur = inp->mctx; + + const auto kv_head = mctx_cur->get_head(); + const auto mem_size = mctx_cur->get_size(); + + const int64_t n_seqs = 
ubatch.n_seqs; ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); cb(conv_states, "conv_states", il); @@ -480,32 +473,52 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state( ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); cb(conv_input, "conv_input", il); - if (!keep) { - ggml_tensor * last_conv_states = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], - conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); - cb(last_conv_states, "last_conv_states", il); + const int64_t row_count = (conv_kernel_size - 1) * conv_channels; + + const size_t row_size = ggml_row_size(conv_states_all->type, row_count); + + if (cparams.n_rs_seq == 0) { + const int64_t s_idx = conv_input->ne[0] - conv_states->ne[0]; + const int64_t s_slot = 0; + + ggml_tensor * conv_state_last = + ggml_view_3d(ctx0, conv_input, + conv_kernel_size - 1, conv_channels, n_seqs, + conv_input->nb[1], conv_input->nb[2], + ggml_row_size(conv_input->type, s_idx)); + cb(conv_state_last, "conv_state_last", il); - ggml_tensor * state_update_target = - ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1], - kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all)); - cb(state_update_target, "state_update_target", il); + ggml_tensor * conv_state_update = + ggml_view_2d(ctx0, conv_states_all, + row_count, n_seqs, conv_states_all->nb[1], + (s_slot * mem_size + kv_head) * row_size); + cb(conv_state_update, "conv_state_update", il); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update)); } else { - const int64_t row_count = (conv_kernel_size - 1) * conv_channels; - const size_t row_size = row_count * ggml_element_size(conv_states_all); - for (int64_t t = 1; t <= 
n_seq_tokens; ++t) { - const uint32_t slot = (uint32_t)(n_seq_tokens - t); - ggml_tensor * src = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, - conv_input->nb[1], conv_input->nb[2], - t * ggml_element_size(conv_input)); - ggml_tensor * dst = - ggml_view_2d(ctx0, conv_states_all, row_count, n_seqs, - conv_states_all->nb[1], - ((size_t) slot * mem_size + kv_head) * row_size); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: this logic incorrectly assumes that the last (n_rs_seq + 1) tokens of a sequence in a batch are + // inside the same ubatch. currently with `split_equal()` this is not correct + + const int64_t K = (int64_t) cparams.n_rs_seq + 1; + + for (int64_t t = 1; t <= K; ++t) { + const int64_t s_idx = std::max(0, conv_input->ne[0] - conv_states->ne[0] - K + t); + const int64_t s_slot = K - t; + + ggml_tensor * conv_state_last = + ggml_view_3d(ctx0, conv_input, + conv_kernel_size - 1, conv_channels, n_seqs, + conv_input->nb[1], conv_input->nb[2], + ggml_row_size(conv_input->type, s_idx)); + + ggml_tensor * conv_state_update = + ggml_view_2d(ctx0, + conv_states_all, row_count, n_seqs, + conv_states_all->nb[1], + (s_slot * mem_size + kv_head) * row_size); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update)); } } @@ -531,7 +544,9 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( const int64_t n_seqs = s->ne[3]; const int64_t n_seq_tokens = q->ne[2]; - if (!keep_rs()) { + const bool keep = cparams.n_rs_seq > 0; + + if (!keep) { auto attn_out = build_delta_net(q, k, v, g, b, s, il); ggml_tensor * output = attn_out.first; ggml_tensor * new_state = attn_out.second; @@ -554,7 +569,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( ggml_tensor * state_3d = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0); ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d); - cb(gdn_out, 
LLAMA_TENSOR_NAME_FGDN_CH, il); + if (n_seq_tokens > 1) { + cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); + } else { + cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_AR, il); + } const int64_t attn_score_elems = S_v * H_v * n_seq_tokens * n_seqs; const int64_t state_size_per_snap = S_v * S_v * H_v * n_seqs; @@ -576,9 +595,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( ggml_row_size(gdn_out->type, S_v * S_v), ggml_row_size(gdn_out->type, S_v * S_v * H_v), ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap)); + ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1], ((size_t) cache_slot * mem_size + kv_head) * row_size); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); } diff --git a/src/models/models.h b/src/models/models.h index 4e40536a5ea3..7e551eb965b1 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -66,9 +66,6 @@ struct llm_build_delta_net_base : public llm_graph_context { ggml_tensor * s, int il); - // true when speculative rollback is enabled and the batch fits in the rs cache - bool keep_rs() const; - // read conv state from cache, concat with qkv_mixed, write back (single slot or per-token) // qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs) ggml_tensor * build_conv_state( diff --git a/tools/cli/README.md b/tools/cli/README.md index c40b5a21cc0f..38bc78a3fdf8 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -191,10 +191,10 @@ | `--spec-draft-override-tensor, -otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) | | `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) | -| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | +| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) | | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) | -| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | diff --git a/tools/server/README.md b/tools/server/README.md index 1ecfb9cdd816..e0a25398c032 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -183,6 +183,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)
(env: LLAMA_ARG_ALIAS) | | `--tags STRING` | set model tags, comma-separated (informational, not used for routing)
(env: LLAMA_ARG_TAGS) | +| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) | | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)
(env: LLAMA_ARG_REUSE_PORT) | @@ -246,10 +247,10 @@ For the full list of features, please refer to [server's changelog](https://gith | `--spec-draft-override-tensor, -otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) | | `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) | -| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | +| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) | | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) | -| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | From 41aef76b0d48e0ae6dbe15cb202b4fd1993edc47 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 19 May 2026 18:41:44 +0200 Subject: [PATCH 04/19] model : clarify MTP layer comment in qwen35.cpp [no ci] (#23338) This commit attempts to clarify a code comment in graph_mtp regarding where the MTP layer is stored. The motivation for this is that it was not obvious to me what the original comment meant and hopefully this makes it clearer. (cherry picked from commit baf3cc6e1d70ce73f66a0665811e5c2228cddc5d) --- src/models/qwen35.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 361d7538a038..35a0158e8548 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -496,7 +496,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - // The MTP block lives at the source file's original layer index. + // hparams.n_layer includes both main model layers and MTP layers. The MTP + // layer is stored immediately after the main layers in model.layers[]. const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; const auto & layer = model.layers[il]; From 1c9f2dfaf3ae42c61b18748d25c98650d5a1e8dd Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 20 May 2026 19:44:30 +0300 Subject: [PATCH 05/19] common/speculative : fix nullptr crash in get_devices_str (#23386) ggml_backend_dev_by_name always appends a nullptr sentinel to the devices vector. Skipping nullptr entries prevents assertion failure in ggml_backend_dev_name. 
Assisted-by: llama.cpp:local pi (cherry picked from commit 510b5c2a35652390c71327ecb29c2fb14bfe0e8c) --- common/speculative.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 4d1b61a13ad9..6ca6bd29670e 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -33,16 +33,15 @@ const std::map common_speculative_type_fro }; static std::string common_speculative_get_devices_str(const std::vector & devices) { - if (devices.empty()) { - return "default"; - } - std::string result; for (size_t i = 0; i < devices.size(); i++) { - if (i > 0) result += ", "; + if (devices[i] == nullptr) { + continue; + } + if (!result.empty()) result += ", "; result += ggml_backend_dev_name(devices[i]); } - return result; + return result.empty() ? "default" : result; } struct common_speculative_config { From 697d19c3328a2bdbca8bdd82ba650e118e10a3a1 Mon Sep 17 00:00:00 2001 From: Gaurav Garg Date: Wed, 20 May 2026 22:34:45 +0530 Subject: [PATCH 06/19] Move to backend sampling for MTP draft path (#23287) * Move to backend sampling for MTP draft path Run top_k(10) on the draft backend. D2H transfers happen only for the top 10 logits Make backend sampling more robust and fallback to CPU on failure cases, such as with "-sm tensor" or when a backend doesn't support TOP_K. * Allow sampler chains to be partially offloaded to backend * Add --spec-draft-backend-sampling argument. Enabled by default. 
(cherry picked from commit ad277572619fcfb6ddd38f4c6437283a4b2b8636) --- common/arg.cpp | 9 +++++++++ common/common.h | 2 ++ common/speculative.cpp | 33 ++++++++++++++++++++++++++++++++- src/llama-context.cpp | 13 +++++++++++++ 4 files changed, 56 insertions(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0ec3d67700c5..b3878fd304c4 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3614,6 +3614,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.draft.p_min = std::stof(value); } ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN")); + add_opt(common_arg( + {"--spec-draft-backend-sampling"}, + {"--no-spec-draft-backend-sampling"}, + string_format("offload draft sampling to the backend (default: %s)", + params.speculative.draft.backend_sampling ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.speculative.draft.backend_sampling = value; + } + ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_BACKEND_SAMPLING")); add_opt(common_arg( {"--spec-draft-device", "-devd", "--device-draft"}, "", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" diff --git a/common/common.h b/common/common.h index b53a6ed86f60..85c562286c6e 100644 --- a/common/common.h +++ b/common/common.h @@ -305,6 +305,8 @@ struct common_params_speculative_draft { float p_split = 0.1f; // speculative decoding split probability float p_min = 0.0f; // minimum speculative decoding probability (greedy) + bool backend_sampling = true; // offload draft sampling to the backend (default: on) + common_params_model mparams; llama_context * ctx_tgt = nullptr; diff --git a/common/speculative.cpp b/common/speculative.cpp index 6ca6bd29670e..253a5ececbb5 100644 --- a/common/speculative.cpp +++ 
b/common/speculative.cpp @@ -413,6 +413,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { std::vector smpls; + // backend sampler chain per seq, attached to ctx_dft + std::vector backend_chains; + int32_t n_embd = 0; // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1. @@ -444,7 +447,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { n_embd = llama_model_n_embd(llama_get_model(ctx_dft)); LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); - LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling); LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, this->params.n_gpu_layers, ggml_type_name(this->params.cache_type_k), @@ -468,6 +471,22 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams)); } + // offload draft sampling to the backend + backend_chains.assign(n_seq, nullptr); + if (this->params.backend_sampling) { + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params()); + llama_sampler_chain_add(chain, llama_sampler_init_top_k(10)); + + if (!llama_set_sampler(ctx_dft, seq_id, chain)) { + LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id); + llama_sampler_free(chain); + chain = nullptr; + } + backend_chains[seq_id] = chain; + } + } + llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false); llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true); @@ -483,6 +502,18 @@ struct 
common_speculative_impl_draft_mtp : public common_speculative_impl { } ~common_speculative_impl_draft_mtp() override { + auto * ctx_dft = this->params.ctx_dft; + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) { + if (backend_chains[seq_id] == nullptr) { + continue; + } + if (ctx_dft) { + llama_set_sampler(ctx_dft, seq_id, nullptr); + } + llama_sampler_free(backend_chains[seq_id]); + } + backend_chains.clear(); + if (batch.token != nullptr) { free(batch.token); batch.token = nullptr; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 691700978fdc..8114d0d173cf 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1136,6 +1136,19 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) { LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler); + if (sampler && model.split_mode() == LLAMA_SPLIT_MODE_TENSOR) { + static bool warned = false; + if (!warned) { + LLAMA_LOG_WARN("%s: backend sampling not supported with SPLIT_MODE_TENSOR; using CPU\n", __func__); + warned = true; + } + if (sampling.samplers.count(seq_id) > 0) { + sched_need_reserve = true; + } + sampling.samplers.erase(seq_id); + return false; + } + const bool can_offload = sampler && sampler->iface->backend_init && From 57cffb2ae52ce688a5703f2f5e4e72a2a567c312 Mon Sep 17 00:00:00 2001 From: Daniel Elliott Date: Wed, 20 May 2026 23:20:51 -0700 Subject: [PATCH 07/19] llama-graph: fix null-buffer crash in llm_graph_input_attn_kv_iswa for SWA-only models (#23131) When a model has zero non-SWA attention layers (e.g. a SWA-only slice of Gemma 4), the base KV cache has no layer tensors. The input tensors (self_k_idxs, self_v_idxs, self_kq_mask) are created as graph input nodes but never consumed by any compute node, so the backend scheduler never allocates a buffer for them. 
Calling mctx->get_base()->set_input_k_idxs() on an unallocated tensor then hits GGML_ASSERT(buffer) at ggml-backend.cpp:194. The same scenario applies symmetrically: if a model had zero SWA layers, the SWA tensors would be unallocated. Fix: guard both the base and SWA set_input calls with null/buffer checks, matching the pattern already used by llm_graph_input_mem_hybrid_iswa::set_input (line ~674) which has the comment: 'base tensors may not be allocated if there are no non-SWA attention layers'. Also fix can_reuse() in the same class to skip the ne[0] and kq_mask checks for unallocated tensors, preventing a null-dereference on the reuse path. (cherry picked from commit eeeaf6180b1edc6c8479d10327f0e378a7911cb9) --- src/llama-graph.cpp | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f156451f6817..9dddf400a65a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -500,15 +500,21 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) { } void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { - mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); - mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); + // base tensors may not be allocated if there are no non-SWA attention layers + if (self_k_idxs && self_k_idxs->buffer) { + mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); + mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + } - mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); - mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); + // swa tensors may not be allocated if there are no SWA attention layers + if (self_k_idxs_swa && self_k_idxs_swa->buffer) { + mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, 
ubatch); + mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + } if (self_k_rot) { mctx->get_base()->set_input_k_rot(self_k_rot); @@ -534,14 +540,21 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { bool res = true; - res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; - //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + // base tensors may not be allocated if there are no non-SWA attention layers + if (self_k_idxs && self_k_idxs->buffer) { + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; - //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + } - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + // swa tensors may not be allocated if there are no SWA attention layers + if (self_k_idxs_swa && self_k_idxs_swa->buffer) { + res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + } return res; } From 676b3edcb92a790c762d6bd24e9826dc200964fb Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 21 May 2026 15:23:14 +0800 Subject: [PATCH 08/19] mtp: use inp_out_ids for skipping logit computation 
(#23433) when doing a follow-up decode for the draft model, we were always doing the logit computation even though it is not required. (cherry picked from commit 12e5d99078af2feaab0f62f69d230529d66ac623) --- src/models/qwen35.cpp | 7 +++++-- src/models/qwen35moe.cpp | 8 ++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 35a0158e8548..adeb0c26e4e2 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -525,8 +525,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr res->add_input(std::move(inp)); - ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); cb(h_norm, "mtp_hnorm", il); @@ -615,6 +616,8 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr cb(cur, "h_pre_norm", -1); res->t_h_pre_norm = cur; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? 
layer.nextn.shared_head_norm : model.output_norm; diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 4f63c410d668..e4512116d382 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -588,8 +588,10 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm res->add_input(std::move(inp)); - ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); cb(h_norm, "mtp_hnorm", il); @@ -710,6 +712,8 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm cb(cur, "h_pre_norm", -1); res->t_h_pre_norm = cur; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? layer.nextn.shared_head_norm : model.output_norm; From 46cc11a79d6c58723928fed63a48d56d5000d82b Mon Sep 17 00:00:00 2001 From: Michael Wand Date: Sat, 23 May 2026 07:30:31 -0400 Subject: [PATCH 09/19] model : add NVFP4 MTP scale tensors (#23563) * Add NVFP4 MTP scale tensors * Link Qwen3.5 MTP tensors * Aligned nullptr (cherry picked from commit b0df4c0cfd2cda10738056771714a5290dc95454) --- src/llama-model.cpp | 12 ++++++++++++ src/llama-model.h | 16 ++++++++++------ src/models/qwen35.cpp | 5 +++-- src/models/qwen35moe.cpp | 5 +++-- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8bf20a716eba..0d21b2a53c57 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1334,6 +1334,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_s && layer.ssm_beta) { layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) { + layer.nextn.eh_proj_s = 
create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) { + layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } // input scales if (!layer.wq_in_s && layer.wq) { @@ -1393,6 +1399,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_in_s && layer.ssm_beta) { layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) { + layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) { + layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } } // output scales if (output && output->type == GGML_TYPE_NVFP4) { diff --git a/src/llama-model.h b/src/llama-model.h index 01c87a75271f..398a0aa725c3 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -202,12 +202,16 @@ struct llama_layer_shortconv { }; struct llama_layer_nextn { - struct ggml_tensor * eh_proj = nullptr; - struct ggml_tensor * embed_tokens = nullptr; - struct ggml_tensor * enorm = nullptr; - struct ggml_tensor * hnorm = nullptr; - struct ggml_tensor * shared_head_head = nullptr; - struct ggml_tensor * shared_head_norm = nullptr; + struct ggml_tensor * eh_proj = nullptr; + struct ggml_tensor * eh_proj_s = nullptr; + struct ggml_tensor * eh_proj_in_s = nullptr; + struct ggml_tensor * embed_tokens = nullptr; + struct ggml_tensor * enorm = nullptr; + struct ggml_tensor * hnorm = nullptr; + struct ggml_tensor * shared_head_head = nullptr; + struct ggml_tensor * shared_head_head_s = nullptr; + struct ggml_tensor * shared_head_head_in_s = nullptr; + struct ggml_tensor * shared_head_norm = 
nullptr; }; struct llama_layer { diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index adeb0c26e4e2..04ecc18fcdc0 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -538,7 +538,7 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); cb(concat, "mtp_concat", il); - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); cb(cur, "mtp_eh_proj", il); ggml_tensor * inpSA = cur; @@ -626,8 +626,9 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr cb(cur, "mtp_shared_head_norm", -1); ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); + cur = build_lora_mm(head_w, cur, head_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index e4512116d382..dc24f6ed5374 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -602,7 +602,7 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); cb(concat, "mtp_concat", il); - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); cb(cur, "mtp_eh_proj", il); ggml_tensor * inpSA = cur; @@ -722,8 +722,9 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm cb(cur, "mtp_shared_head_norm", -1); ggml_tensor * head_w = layer.nextn.shared_head_head ? 
layer.nextn.shared_head_head : model.output; + ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); + cur = build_lora_mm(head_w, cur, head_s); cb(cur, "result_output", -1); res->t_logits = cur; From ab11a715a072dbc78923bf83d3349811f8420978 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 29 May 2026 14:17:32 +0800 Subject: [PATCH 10/19] llama: add llm_graph_input_mtp (#23643) * llama: add llm_graph_input_mtp * rename input_mtp -> input_token_embd * add TODO about mtmd embedding * cont : clean-up --------- Co-authored-by: Georgi Gerganov (cherry picked from commit eef59a764264efc025be974e0452584f584a3c59) --- src/llama-graph.cpp | 33 +++++++++++++++++++++++++++++++++ src/llama-graph.h | 17 +++++++++++++++++ src/models/qwen35.cpp | 29 +++++++++++++++++++++-------- src/models/qwen35moe.cpp | 28 ++++++++++++++++++++-------- 4 files changed, 91 insertions(+), 16 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 9dddf400a65a..0ee9e5384042 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -102,6 +102,39 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) { return res; } +void llm_graph_input_embd_h::set_input(const llama_ubatch * ubatch) { + const int64_t n_tokens = ubatch->n_tokens; + + if (ubatch->token) { + ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens)); + } else { + // note: mtmd embedding input goes through here + GGML_ASSERT(ubatch->embd); + GGML_ASSERT(n_embd == embd->ne[0]); + + ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(h)); + } + + // TODO: extend llama_ubatch to differentiate between token embeddings and hidden states + // for now, we assume that the hidden state is always provided as an embedding + // ref: 
https://github.com/ggml-org/llama.cpp/pull/23643 + if (ubatch->embd) { + GGML_ASSERT(n_embd == h->ne[0]); + + ggml_backend_tensor_set(h, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(h)); + } +} + +bool llm_graph_input_embd_h::can_reuse(const llm_graph_params & params) { + bool res = true; + + res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); + res &= (!params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens); + res &= (!params.ubatch.embd) || (h && h->ne[1] == params.ubatch.n_tokens); + + return res; +} + void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; diff --git a/src/llama-graph.h b/src/llama-graph.h index bf6778237e6f..e240ade7b0c5 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -121,6 +121,23 @@ class llm_graph_input_embd : public llm_graph_input_i { const int64_t n_embd = 0; }; +// similar to llm_graph_input_embd but with an additional hidden state input +class llm_graph_input_embd_h : public llm_graph_input_i { +public: + llm_graph_input_embd_h(int64_t n_embd) : n_embd(n_embd) {} + virtual ~llm_graph_input_embd_h() = default; + + void set_input(const llama_ubatch * ubatch) override; + + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * tokens = nullptr; // I32 [n_batch] + ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] + ggml_tensor * h = nullptr; // F32 [n_embd, n_batch] + + const int64_t n_embd = 0; +}; + class llm_graph_input_pos : public llm_graph_input_i { public: llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 04ecc18fcdc0..ba63ae441df5 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -508,28 +508,41 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr int sections[4]; std::copy(std::begin(hparams.rope_sections), 
std::begin(hparams.rope_sections) + 4, sections); - auto inp = std::make_unique(hparams.n_embd); + // TODO: extract in a common llm_graph_context::build_inp_embd_h() + auto inp = std::make_unique(hparams.n_embd); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_set_input(inp->tokens); - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens); ggml_set_input(inp->embd); - ggml_set_name(inp->embd, "mtp_h_input"); - ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; + // TODO: make static using `ggml_build_forward_select()` + // see llm_graph_context::build_inp_embd() for reference + ggml_tensor * tok_embd; + if (ubatch.token) { + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; - ggml_tensor * h_input = inp->embd; - ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + } else { + tok_embd = inp->embd; + } cb(tok_embd, "mtp_tok_embd", il); + inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->h); + ggml_set_name(inp->h, "mtp_h_input"); + + ggml_tensor * h_embd = inp->h; + res->add_input(std::move(inp)); ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = build_inp_out_ids(); - auto * inp_attn = build_attn_inp_kv(); - ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); cb(h_norm, "mtp_hnorm", il); ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index dc24f6ed5374..4f87d55d9112 100644 --- a/src/models/qwen35moe.cpp +++ 
b/src/models/qwen35moe.cpp @@ -571,29 +571,41 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); - auto inp = std::make_unique(hparams.n_embd); + // TODO: extract in a common llm_graph_context::build_inp_embd_h() + auto inp = std::make_unique(hparams.n_embd); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_set_input(inp->tokens); - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens); ggml_set_input(inp->embd); - ggml_set_name(inp->embd, "mtp_h_input"); - ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; + // TODO: make static using `ggml_build_forward_select()` + // see llm_graph_context::build_inp_embd() for reference + ggml_tensor * tok_embd; + if (ubatch.token) { + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? 
layer.nextn.embed_tokens : model.tok_embd; - ggml_tensor * h_input = inp->embd; - ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + } else { + tok_embd = inp->embd; + } cb(tok_embd, "mtp_tok_embd", il); + inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->h); + ggml_set_name(inp->h, "mtp_h_input"); + + ggml_tensor * h_embd = inp->h; + res->add_input(std::move(inp)); ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = build_inp_out_ids(); - auto * inp_attn = build_attn_inp_kv(); + auto * inp_attn = build_attn_inp_kv(); - ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); cb(h_norm, "mtp_hnorm", il); ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); From 337e04cee72e1813d554a1d47c3724dc6fc1202e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 1 Jun 2026 22:26:58 +0300 Subject: [PATCH 11/19] speculative : fix n_outputs_max and remove draft-simple auto-enable (#23988) * speculative : add common_speculative_n_max helper function Extract the speculative max-draft-size logic from server_n_outputs_max into a reusable common_speculative_n_max() function in common/speculative. 
Assisted-by: llama.cpp:local pi * cont : draft context always has n_parallel outputs * llama : log n_outputs_max * speculative : remove draft-simple auto-enable * ci : enable server tests on PRs (cherry picked from commit 5dcb71166686799f0d873eab7386234302d05ecf) --- common/arg.cpp | 4 - common/speculative.cpp | 46 ++++++++--- common/speculative.h | 3 + src/llama-context.cpp | 1 + tools/server/server-context.cpp | 136 ++++++++++++++++++++++++++++++++ 5 files changed, 174 insertions(+), 16 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index b3878fd304c4..8b8e5f59a487 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1045,11 +1045,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex // we define here to make sure it's included in llama-gen-docs if (ex == LLAMA_EXAMPLE_COMPLETION) { params.use_jinja = false; // disable jinja by default - } else if (ex == LLAMA_EXAMPLE_MTMD) { params.use_jinja = false; // disable jinja by default params.sampling.temp = 0.2; // lower temp by default for better quality - } else if (ex == LLAMA_EXAMPLE_SERVER) { params.n_parallel = -1; // auto by default } @@ -1070,7 +1068,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex sampler_type_names.pop_back(); // remove last semicolon } - /** * filter options by example * rules: @@ -1084,7 +1081,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } }; - add_opt(common_arg( {"-h", "--help", "--usage"}, "print usage and exit", diff --git a/common/speculative.cpp b/common/speculative.cpp index 253a5ececbb5..73830fda6c97 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -1317,6 +1317,40 @@ static uint32_t common_get_enabled_speculative_configs(const std::vector<enum common_speculative_type> & configs) { +int32_t common_speculative_n_max(const common_params_speculative * spec) { + int32_t n_max = 0; + + for (const auto & type : spec->types) { + switch (type) { + case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: + case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: + case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: + n_max = std::max(n_max, std::max(0,
spec->draft.n_max)); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: + n_max = std::max(n_max, (int32_t) spec->ngram_simple.size_m); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: + n_max = std::max(n_max, (int32_t) spec->ngram_map_k.size_m); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: + n_max = std::max(n_max, (int32_t) spec->ngram_map_k4v.size_m); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: + n_max = std::max(n_max, std::max(0, spec->ngram_mod.n_max)); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: + n_max = std::max(n_max, (int32_t) 8); + break; + case COMMON_SPECULATIVE_TYPE_NONE: + case COMMON_SPECULATIVE_TYPE_COUNT: + break; + } + } + + return n_max; +} + // initialization of the speculative decoding system // common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq) { @@ -1325,8 +1359,6 @@ common_speculative * common_speculative_init(common_params_speculative & params, { uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types); - bool has_draft_model_path = !params.draft.mparams.path.empty(); - bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE)); bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3 bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr; @@ -1359,16 +1391,6 @@ common_speculative * common_speculative_init(common_params_speculative & params, if (has_ngram_cache) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params)); } - if (has_draft_simple) { - if (!has_draft_model_path) { - LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__); - has_draft_simple = false; - } - } else if (has_draft_model_path && !has_mtp && !has_draft_eagle3) { - LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__); - 
has_draft_simple = true; - } - if (has_draft_simple) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, params)); } diff --git a/common/speculative.h b/common/speculative.h index f24bac79edb7..deba7dac7208 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -20,6 +20,9 @@ enum common_speculative_type common_speculative_type_from_name(const std::string // convert type to string std::string common_speculative_type_to_str(enum common_speculative_type type); +// return the max number of draft tokens based on the speculative parameters +int32_t common_speculative_n_max(const common_params_speculative * spec); + common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq); void common_speculative_free(common_speculative * spec); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 8114d0d173cf..e59c1f74e2b2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -226,6 +226,7 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); LLAMA_LOG_INFO("%s: n_rs_seq = %u\n", __func__, cparams.n_rs_seq); + LLAMA_LOG_INFO("%s: n_outputs_max = %u\n", __func__, cparams.n_outputs_max); if (cparams.n_ctx_seq < hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index bb28fad4a5ee..59cb23dc292b 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -37,6 +37,21 @@ using json = nlohmann::ordered_json; constexpr int HTTP_POLLING_SECONDS = 1; +static uint32_t server_n_outputs_max(const common_params & params) { + const uint32_t n_batch = params.n_batch; + + if (params.embedding || + (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED && params.pooling_type != 
LLAMA_POOLING_TYPE_NONE)) { + return n_batch; + } + + const uint32_t n_outputs_per_seq = 1 + common_speculative_n_max(&params.speculative); + + const uint64_t n_outputs = (uint64_t) params.n_parallel * n_outputs_per_seq; + + return std::max(1, std::min(n_batch, n_outputs)); +} + // state diagram: https://github.com/ggml-org/llama.cpp/pull/9283 enum slot_state { SLOT_STATE_IDLE, @@ -743,6 +758,127 @@ struct server_context_impl { SRV_INF("loading model '%s'\n", params.model.path.c_str()); params_base = params; + params_base.n_outputs_max = server_n_outputs_max(params_base); + + std::string & mmproj_path = params_base.mmproj.path; + bool has_mmproj = !mmproj_path.empty(); + mtmd_context_params mparams = mtmd_context_params_default(); + if (has_mmproj) { + mparams.use_gpu = params_base.mmproj_use_gpu; + mparams.print_timings = false; + mparams.n_threads = params_base.cpuparams.n_threads; + mparams.flash_attn_type = params_base.flash_attn_type; + mparams.warmup = params_base.warmup; + mparams.image_min_tokens = params_base.image_min_tokens; + mparams.image_max_tokens = params_base.image_max_tokens; + mparams.media_marker = get_media_marker(); + } + + // optionally get the memory usage of mmproj + if (has_mmproj && params_base.fit_params) { + auto mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams); + if (!mmproj_mem.empty()) { + size_t total = 0; + for (auto & [dev, size] : mmproj_mem) { + total += size; + } + SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB\n", total / (1024.0 * 1024.0)); + GGML_ASSERT(!params_base.fit_params_target.empty()); + for (auto & [dev, size] : mmproj_mem) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + if (ggml_backend_dev_get(i) == dev) { + if (i < params_base.fit_params_target.size()) { + SRV_DBG("[mtmd] adding %.2f MiB to fit_params_target for device %s\n", size / (1024.0 * 1024.0), ggml_backend_dev_name(dev)); + params_base.fit_params_target[i] += size; + } + break; + } + } + } + } else { +
SRV_ERR("%s", "[mtmd] failed to get memory usage of mmproj\n"); + } + } + + // optionally reserve VRAM for the draft / MTP context before fitting the target model + if (params_base.fit_params) { + const bool spec_mtp = std::find(params_base.speculative.types.begin(), + params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); + const bool has_draft = params_base.speculative.has_dft(); + + if (has_draft || spec_mtp) { + common_params params_dft = params_base; + bool measure_model_bytes = true; + + if (has_draft) { + const auto & params_spec = params_base.speculative.draft; + params_dft.devices = params_spec.devices; + params_dft.model = params_spec.mparams; + params_dft.n_gpu_layers = params_spec.n_gpu_layers; + params_dft.cache_type_k = params_spec.cache_type_k; + params_dft.cache_type_v = params_spec.cache_type_v; + params_dft.tensor_buft_overrides = params_spec.tensor_buft_overrides; + } else { + // MTP draft context lives on the target model, only context+compute are new + measure_model_bytes = false; + } + + params_dft.n_outputs_max = params_base.n_parallel; + + auto mparams_dft = common_model_params_to_llama(params_dft); + auto cparams_dft = common_context_params_to_llama(params_dft); + if (spec_mtp) { + cparams_dft.ctx_type = LLAMA_CONTEXT_TYPE_MTP; + cparams_dft.type_k = params_base.speculative.draft.cache_type_k; + cparams_dft.type_v = params_base.speculative.draft.cache_type_v; + } + cparams_dft.n_rs_seq = 0; + + std::vector devs; + uint32_t hp_ngl = 0; + uint32_t hp_nct = 0; + uint32_t hp_nex = 0; + try { + auto dmd = common_get_device_memory_data( + params_dft.model.path.c_str(), &mparams_dft, &cparams_dft, + devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR); + + GGML_ASSERT(!params_base.fit_params_target.empty()); + size_t total = 0; + + std::vector tgt_devices = params.devices; + + if (tgt_devices.empty()) { + for(size_t i = 0; i < ggml_backend_dev_count(); ++i) { + 
tgt_devices.push_back(ggml_backend_dev_get(i)); + } + } + + for (size_t j = 0; j < devs.size(); ++j) { + const size_t bytes = + (measure_model_bytes ? dmd[j].mb.model : 0) + + dmd[j].mb.context + + dmd[j].mb.compute; + total += bytes; + for (size_t i = 0; i < tgt_devices.size(); i++) { + if (tgt_devices[i] == devs[j]) { + SRV_DBG("[spec] adding %.2f MiB to fit_params_target for device %s\n", + bytes / (1024.0 * 1024.0), ggml_backend_dev_name(devs[j])); + params_base.fit_params_target[i] += bytes; + break; + } + } + } + SRV_INF("[spec] estimated memory usage of %s is %.2f MiB\n", + has_draft ? "draft model" : "MTP context", + total / (1024.0 * 1024.0)); + } catch (const std::exception & e) { + SRV_ERR("[spec] failed to measure %s memory: %s\n", + has_draft ? "draft model" : "MTP context", e.what()); + } + } + } llama_init = common_init_from_params(params_base); From 75606176f735c1cc4e127fb82ce441e82b26f03d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 3 Jun 2026 10:15:27 +0300 Subject: [PATCH 12/19] tests : add support for qwen3 SSM archs (#24031) * tests : add support for qwen3 SSM archs * arch : add LLM_KV_ATTENTION_RECURRENT_LAYERS * cont : naming + TODOs (cherry picked from commit 06938ac129e5feee1e731323e5c37dc973de5573) --- src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-hparams.cpp | 25 +++++++++++++++++++------ src/llama-hparams.h | 31 +++++++++++++++++++------------ src/llama-memory-hybrid-iswa.cpp | 4 ++-- src/llama-memory-hybrid.cpp | 4 ++-- src/llama-model-loader.cpp | 8 ++++---- src/llama-model-saver.cpp | 8 ++++---- src/llama-model.cpp | 30 ++++++++++++++---------------- src/models/falcon-h1.cpp | 2 +- src/models/gemma4.cpp | 2 +- src/models/granite-hybrid.cpp | 6 +++--- src/models/jamba.cpp | 2 +- src/models/kimi-linear.cpp | 6 +++--- src/models/lfm2.cpp | 10 +++++----- src/models/lfm2moe.cpp | 4 ++-- src/models/llama4.cpp | 3 ++- src/models/mimo2.cpp | 3 ++- src/models/nemotron-h.cpp | 6 +++--- src/models/plamo2.cpp | 6 +++--- 
src/models/qwen35.cpp | 9 +++++---- src/models/qwen35moe.cpp | 9 +++++---- src/models/qwen3next.cpp | 8 ++++---- src/models/step35.cpp | 4 +++- 24 files changed, 109 insertions(+), 83 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 02ebc2168292..200b099a388c 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -243,6 +243,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" }, { LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" }, { LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" }, + { LLM_KV_ATTENTION_RECURRENT_LAYERS, "%s.attention.recurrent_layers" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index f3ca6b51c17f..dfb89eb8292a 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -247,6 +247,7 @@ enum llm_kv { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, LLM_KV_ATTENTION_INDEXER_TOP_K, LLM_KV_ATTENTION_SHARED_KV_LAYERS, + LLM_KV_ATTENTION_RECURRENT_LAYERS, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT_SWA, diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 2239309c8fb4..087afec55c60 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -8,18 +8,31 @@ void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { if (dense_first) { for (uint32_t il = 0; il < n_layer; ++il) { - swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0); + is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0); } } else { for (uint32_t il = 0; il < n_layer; ++il) { - swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); + is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); } } } +// TODO: implement +//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) { +// if (dense_first) { +// for (uint32_t il = 0; il < n_layer; ++il) { 
+// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0); +// } +// } else { +// for (uint32_t il = 0; il < n_layer; ++il) { +// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); +// } +// } +//} + bool llama_hparams::is_swa_any() const { for (uint32_t il = 0; il < n_layer; ++il) { - if (swa_layers[il]) { + if (is_swa_impl[il]) { return true; } } @@ -193,9 +206,9 @@ uint32_t llama_hparams::n_embd_s() const { return ssm_d_state * ssm_d_inner; } -bool llama_hparams::is_recurrent(uint32_t il) const { +bool llama_hparams::is_recr(uint32_t il) const { if (il < n_layer) { - return recurrent_layer_arr[il]; + return is_recr_impl[il]; } GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer); @@ -207,7 +220,7 @@ uint32_t llama_hparams::n_pos_per_embd() const { bool llama_hparams::is_swa(uint32_t il) const { if (il < n_layer) { - return swa_layers[il]; + return is_swa_impl[il]; } GGML_ABORT("fatal error"); diff --git a/src/llama-hparams.h b/src/llama-hparams.h index e2d051edc6cd..43b6940a0e93 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -34,6 +34,9 @@ struct llama_hparams_convnext { }; struct llama_hparams { + // note: use the `_impl` suffix to avoid name conflict between members and getters + // for example: n_embd_out() vs n_embd_out_impl + bool vocab_only; bool no_alloc; bool rope_finetuned; @@ -43,7 +46,7 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; uint32_t n_layer; - int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache + int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache uint32_t n_expert = 0; uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; @@ -134,11 +137,15 @@ struct llama_hparams { llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; // the size of the sliding window (0 - no SWA) uint32_t n_swa = 0; - // if swa_layers[il] 
== 1, then layer il is SWA - // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA) + + // if is_swa_impl[il] == 1, then layer il is SWA + // if is_swa_impl[il] == 0, then layer il is dense (i.e. non-SWA) // by default, all layers are dense // note: using uint32_t type for compatibility reason - std::array swa_layers; + std::array is_swa_impl; + + // for hybrid state space models + std::array is_recr_impl; // for State Space Models uint32_t ssm_d_conv = 0; @@ -150,9 +157,6 @@ struct llama_hparams { // for Kimi Linear KDA uint32_t n_embd_head_kda = 0; - // for hybrid state space models - std::array recurrent_layer_arr; - bool ssm_dt_b_c_rms = false; float f_clamp_kqv = 0.0f; @@ -255,6 +259,14 @@ struct llama_hparams { // return true if one of the layers is SWA bool is_swa_any() const; + bool is_swa(uint32_t il) const; + + // TODO: implement + //void set_recr_pattern(uint32_t n_pattern, bool dense_first = false); + + // whether or not the given layer is recurrent (for hybrid models) + bool is_recr(uint32_t il) const; + uint32_t n_head(uint32_t il = 0) const; uint32_t n_head_kv(uint32_t il = 0) const; @@ -296,13 +308,8 @@ struct llama_hparams { // dimension of the recurrent state embeddings uint32_t n_embd_s() const; - // whether or not the given layer is recurrent (for hybrid models) - bool is_recurrent(uint32_t il) const; - uint32_t n_pos_per_embd() const; - bool is_swa(uint32_t il) const; - // note: currently only support if either all or none of the layers are MLA bool is_mla() const; diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp index 72f5c2fea725..a242079b4066 100644 --- a/src/llama-memory-hybrid-iswa.cpp +++ b/src/llama-memory-hybrid-iswa.cpp @@ -44,7 +44,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa( n_ubatch, n_pad, filter_attn == nullptr ? 
- [&](int32_t il) { return !hparams.is_recurrent(il); } + [&](int32_t il) { return !hparams.is_recr(il); } : filter_attn, nullptr )), @@ -57,7 +57,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa( n_seq_max, n_rs_seq, filter_recr == nullptr ? - [&](int32_t il) { return hparams.is_recurrent(il); } + [&](int32_t il) { return hparams.is_recr(il); } : filter_recr )) {} diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index ed28625870b7..c14c4cd9a797 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -44,7 +44,7 @@ llama_memory_hybrid::llama_memory_hybrid( n_swa, swa_type, filter_attn == nullptr ? - [&](int32_t il) { return !hparams.is_recurrent(il); } + [&](int32_t il) { return !hparams.is_recr(il); } : filter_attn, nullptr )), @@ -57,7 +57,7 @@ llama_memory_hybrid::llama_memory_hybrid( n_seq_max, n_rs_seq, filter_recr == nullptr ? - [&](int32_t il) { return hparams.is_recurrent(il); } + [&](int32_t il) { return hparams.is_recr(il); } : filter_recr )) {} diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 54e5974e62c4..01e1a22498f6 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -148,7 +148,7 @@ namespace GGUFMeta { const enum gguf_type arr_type = gguf_get_arr_type(ctx, k); return ArrayInfo { arr_type, - size_t(gguf_get_arr_n(ctx, k)), + gguf_get_arr_n(ctx, k), arr_type == GGUF_TYPE_STRING ? 
nullptr : gguf_get_arr_data(ctx, k), }; } @@ -447,7 +447,7 @@ namespace GGUFMeta { } if (n > N_MAX) { - throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); + throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", n, (uint32_t) N_MAX, key.c_str())); } if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) { @@ -504,9 +504,9 @@ namespace GGUFMeta { } // TODO: this is not very clever - figure out something better - template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); + template bool llama_model_loader::get_key_or_arr> (enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); - template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); + template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); llama_model_loader::llama_model_loader( diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 528e4c9c069f..168ac335bd40 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -14,9 +14,6 @@ bool llama_model_saver_supports_arch(llm_arch arch) { switch (arch) { - case LLM_ARCH_QWEN3NEXT: - case LLM_ARCH_QWEN35: - case LLM_ARCH_QWEN35MOE: case LLM_ARCH_PLAMO3: case LLM_ARCH_GEMMA3: case LLM_ARCH_GEMMA3N: @@ -106,6 +103,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, c gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values); } else if (std::is_same::value) { gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values); + } else if (std::is_same::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_BOOL, value.data(), n_values); } else if (std::is_same::value) { 
gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values); } else if (std::is_same::value) { @@ -244,7 +243,7 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); add_kv(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count); add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step); - // add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???); + // add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???); // saved as LLM_KV_ATTENTION_RECURRENT_LAYERS instead add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true); add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true); @@ -278,6 +277,7 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head); add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size); add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k); + add_kv(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, true); const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 
0.0f : 1.0f/hparams.rope_freq_scale_train; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0d21b2a53c57..20584deb2638 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -366,10 +366,10 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str // count only the same type of previous layers to avoid this auto get_il_eff = [&](const size_t il){ size_t ret = 0; - const bool il_is_recurrent = hparams.is_recurrent(il); - const bool il_is_swa = hparams.is_swa(il); + const bool il_is_recr = hparams.is_recr(il); + const bool il_is_swa = hparams.is_swa(il); for (size_t il_prev = 0; il_prev < il; il_prev++) { - ret += hparams.is_recurrent(il_prev) == il_is_recurrent && hparams.is_swa(il_prev) == il_is_swa; + ret += hparams.is_recr(il_prev) == il_is_recr && hparams.is_swa(il_prev) == il_is_swa; } return ret; }; @@ -545,8 +545,8 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str return {tensor->ne[axis]}; }; - auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector & segments) -> std::vector { - if (hparams.is_recurrent(il)) { + auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector> & segments) -> std::vector { + if (hparams.is_recr(il)) { // linear attention const int64_t head_dim = hparams.ssm_d_state; const int64_t granularity_qkv = std::lcm(blck_size, head_dim); @@ -1042,18 +1042,16 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); - std::fill( - hparams.recurrent_layer_arr.begin(), - hparams.recurrent_layer_arr.end(), - llm_arch_is_recurrent(ml.get_arch())); std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0); - std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0); + 
std::fill(hparams.is_swa_impl.begin(), hparams.is_swa_impl.end(), 0); + std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), llm_arch_is_recurrent(ml.get_arch()) ? 1 : 0); std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f); std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f); - std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f); - std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f); + std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f); + std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f); + std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f); std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f); @@ -1985,18 +1983,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, filter_recr = [&](int32_t) { return true; }; } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { filter_attn = [&](int32_t il) { - return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0; + return !hparams.is_recr(il) && hparams.n_ff(il) == 0; }; filter_recr = [&](int32_t il) { - return hparams.is_recurrent(il) && hparams.n_ff(il) == 0; + return hparams.is_recr(il) && hparams.n_ff(il) == 0; }; } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) { const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; filter_attn = [&, n_main](int32_t il) { - return (uint32_t)il < n_main && !hparams.is_recurrent(il); + return (uint32_t)il < n_main && !hparams.is_recr(il); }; filter_recr = [&, n_main](int32_t il) { - return (uint32_t)il < n_main && hparams.is_recurrent(il); + return (uint32_t)il < n_main && hparams.is_recr(il); }; } diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp index 94b65a3c7c94..c130ccdd49ee 100644 --- a/src/models/falcon-h1.cpp +++ b/src/models/falcon-h1.cpp @@ -11,7 +11,7 @@ void 
llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); - std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true); + std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true); switch (hparams.n_layer) { case 36: diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index c92ffffd23ac..31906de33d97 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -2,7 +2,7 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); uint32_t n_kv_shared_layers = 0; ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp index 27f6706ea10f..8740d9fc7d9b 100644 --- a/src/models/granite-hybrid.cpp +++ b/src/models/granite-hybrid.cpp @@ -20,7 +20,7 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) { // A layer is recurrent IFF the n_head_kv value is set to 0 for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; + hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -71,7 +71,7 @@ void llama_model_granite_hybrid::load_arch_tensors(llama_model_loader &) { // norm layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - if (hparams.is_recurrent(i)) { + if (hparams.is_recr(i)) { // ssm layers layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0); @@ -158,7 +158,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & 
model, const llm_gr cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // ssm layer // cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il); } else { diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp index 84ea63c31367..a62b121b3eea 100644 --- a/src/models/jamba.cpp +++ b/src/models/jamba.cpp @@ -9,7 +9,7 @@ void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; + hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } switch (hparams.n_layer) { diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index ecffb105496b..c13f71b5bcb1 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -15,7 +15,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) { // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba) // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention) for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent + hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent } // MoE parameters - Kimi uses moe_intermediate_size = 1024 @@ -53,7 +53,7 @@ void llama_model_kimi_linear::load_arch_tensors(llama_model_loader &) { const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda; const int64_t ssm_d_conv = hparams.ssm_d_conv; - if (hparams.is_recurrent(i)) { + if (hparams.is_recr(i)) { // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1) // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner] layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * 
n_head, 1}, TENSOR_NOT_REQUIRED); @@ -285,7 +285,7 @@ llama_model_kimi_linear::graph::graph(const llama_model & model, const llm_graph ggml_build_forward_expand(gf, cur); - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // === KDA Layer (Kimi Delta Attention) with Recurrent State === // Reference: vLLM kda.py const auto * mctx_cur = inp_rs->mctx; diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index 29081344b245..3898b56bb12d 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -6,7 +6,7 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); for (uint32_t il = 0; il < hparams.n_layer; ++il) { - hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0; + hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0; } hparams.n_layer_dense_lead = hparams.n_layer; switch (hparams.n_ff()) { @@ -19,7 +19,7 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; for (uint32_t il = 0; il < hparams.n_layer; ++il) { - hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il]; + hparams.is_swa_impl[il] = !hparams.is_recr_impl[il]; } } } @@ -59,7 +59,7 @@ void llama_model_lfm2::load_arch_tensors(llama_model_loader &) { // for operator_norm layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recr(i)) { layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa); @@ -235,8 +235,8 @@ llama_model_lfm2::graph::graph(const llama_model & model, const llm_graph_ cur = 
build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "model.layers.{}.operator_norm", il); - cur = hparams.is_recurrent(il) ? build_shortconv_block(cur, inp_hybrid->get_recr(), il) : - build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il); + cur = hparams.is_recr(il) ? build_shortconv_block(cur, inp_hybrid->get_recr(), il) : + build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il); if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); diff --git a/src/models/lfm2moe.cpp b/src/models/lfm2moe.cpp index 12a66c05c7dc..81ced2eaba2f 100644 --- a/src/models/lfm2moe.cpp +++ b/src/models/lfm2moe.cpp @@ -10,7 +10,7 @@ void llama_model_lfm2moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); for (uint32_t il = 0; il < hparams.n_layer; ++il) { - hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0; + hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0; } switch (hparams.n_layer) { @@ -55,7 +55,7 @@ void llama_model_lfm2moe::load_arch_tensors(llama_model_loader &) { // for operator_norm layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recr(i)) { layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa); diff --git a/src/models/llama4.cpp b/src/models/llama4.cpp index 0ff5376d571f..8f39b3f59a5b 100644 --- a/src/models/llama4.cpp +++ b/src/models/llama4.cpp @@ -15,7 +15,8 @@ void llama_model_llama4::load_arch_hparams(llama_model_loader & ml) { hparams.n_attn_temp_floor_scale = 8192; hparams.f_attn_temp_scale = 0.1f; hparams.f_attn_temp_offset = 1.0f; - uint32_t swa_period = 4; // pattern: 3 chunked - 1 full + + uint32_t swa_period = 4; // pattern: 3 chunked - 1 full 
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); hparams.set_swa_pattern(swa_period); diff --git a/src/models/mimo2.cpp b/src/models/mimo2.cpp index d0295ec116f6..1bcdf696f2e8 100644 --- a/src/models/mimo2.cpp +++ b/src/models/mimo2.cpp @@ -8,7 +8,8 @@ void llama_model_mimo2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); + + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); float value_scale = 0.0f; if (ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, value_scale, false) && value_scale != 1.0f) { diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index a82f9c170b48..d2c811d2497c 100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -10,7 +10,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) { // A layer is recurrent IFF the n_head_kv value is set to 0 and // the n_ff value is set to 0 for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0); + hparams.is_recr_impl[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0); } ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -62,7 +62,7 @@ void llama_model_nemotron_h::load_arch_tensors(llama_model_loader &) { // all blocks use the attn norm layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - if (hparams.is_recurrent(i)) { + if (hparams.is_recr(i)) { // ssm layers layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0); @@ -143,7 +143,7 @@ llama_model_nemotron_h::graph::graph(const llama_model & model, const llm_graph_ cur 
= build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // ssm layer // cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il); } else if (hparams.n_ff(il) == 0) { diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp index b713889fe72f..2ffa0898f712 100644 --- a/src/models/plamo2.cpp +++ b/src/models/plamo2.cpp @@ -12,7 +12,7 @@ void llama_model_plamo2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; + hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } switch (hparams.n_layer) { @@ -54,7 +54,7 @@ void llama_model_plamo2::load_arch_tensors(llama_model_loader &) { for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; - bool is_mamba_layer = hparams.is_recurrent(i); + bool is_mamba_layer = hparams.is_recr(i); layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); @@ -128,7 +128,7 @@ llama_model_plamo2::graph::graph(const llama_model & model, const llm_graph_para cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); // check if this layer is Mamba or Attention - const bool is_mamba_layer = hparams.is_recurrent(il); + const bool is_mamba_layer = hparams.is_recr(il); if (is_mamba_layer) { // PLaMo-2 Mamba layer diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index ba63ae441df5..f8fd53696239 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -18,12 +18,13 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. 
- { + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); + hparams.is_recr_impl[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); } } @@ -69,7 +70,7 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); - if (!hparams.is_recurrent(il)) { + if (!hparams.is_recr(il)) { // Attention layers create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); @@ -168,7 +169,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para ggml_build_forward_expand(gf, cur); // Determine layer type and build appropriate attention mechanism - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // Linear attention layer (gated delta net) cur = build_layer_attn_linear(inp->get_recr(), cur, il); } else { diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 4f87d55d9112..8db0b4717d99 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -21,12 +21,13 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. 
- { + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); + hparams.is_recr_impl[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); } } @@ -75,7 +76,7 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); - if (!hparams.is_recurrent(il)) { + if (!hparams.is_recr(il)) { // Attention layers create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); @@ -191,7 +192,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p ggml_build_forward_expand(gf, cur); // Determine layer type and build appropriate attention mechanism - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // Linear attention layer (gated delta net) cur = build_layer_attn_linear(inp->get_recr(), cur, il); } else { diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 1d873427db5e..9e09ae6f232d 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -14,11 +14,11 @@ void llama_model_qwen3next::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // Mark recurrent layers (linear attention layers) - { + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { uint32_t full_attn_interval = 4; 
ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0); + hparams.is_recr_impl[i] = ((i + 1) % full_attn_interval != 0); } } @@ -68,7 +68,7 @@ void llama_model_qwen3next::load_arch_tensors(llama_model_loader &) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recr(i)) { // Attention layers create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); @@ -129,7 +129,7 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p ggml_build_forward_expand(gf, cur); // Determine layer type and build appropriate attention mechanism - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // Linear attention layer (gated delta net) cur = build_layer_attn_linear(inp->get_recr(), cur, il); } else { diff --git a/src/models/step35.cpp b/src/models/step35.cpp index 3b68e68707ae..b1e379e65543 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -22,7 +22,9 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); + + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false); ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false); From 
0c809f0c1f3d5e3f4f12b2c5aa5cba2a2e89801a Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 4 Jun 2026 01:29:09 +0800 Subject: [PATCH 13/19] qwen35: use post-norm hidden state for MTP (#24025) * qwen35: use post-norm hidden state for MTP * rename pre_norm to nextn * fix step35 (cherry picked from commit 166fe29492abb4093ec889b5c6f6fdb4e3b8ba98) --- common/speculative.cpp | 20 ++-- common/speculative.h | 4 +- src/llama-context.cpp | 140 +++++++++++----------- src/llama-context.h | 14 +-- src/llama-cparams.h | 4 +- src/llama-ext.h | 12 +- src/llama-graph.cpp | 4 +- src/llama-graph.h | 4 +- src/models/qwen35.cpp | 25 ++-- src/models/qwen35moe.cpp | 25 ++-- src/models/step35.cpp | 200 +++++++++++++++++++++++++++++++- tools/server/server-context.cpp | 6 +- 12 files changed, 324 insertions(+), 134 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index 73830fda6c97..aa0d0ed86b24 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -3,7 +3,7 @@ #include "common.h" #include "ggml.h" #include "llama.h" -#include "../src/llama-ext.h" // staging API: llama_set_embeddings_pre_norm / llama_get_embeddings_pre_norm_ith (used by MTP) +#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP) #include "log.h" #include "ngram-cache.h" #include "ngram-map.h" @@ -162,7 +162,7 @@ struct common_speculative_impl { virtual bool need_embd() const = 0; // true if this implementation requires the target context to extract pre-norm embeddings - virtual bool need_embd_pre_norm() const { return false; } + virtual bool need_embd_nextn() const { return false; } }; struct common_speculative_impl_draft_simple : public common_speculative_impl { @@ -487,8 +487,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { } } - llama_set_embeddings_pre_norm(ctx_tgt, true, /*masked*/ false); - llama_set_embeddings_pre_norm(ctx_dft, true, /*masked*/ true); + 
llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false); + llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); pending_h.assign(n_seq, std::vector(n_embd, 0.0f)); @@ -583,7 +583,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { // ^--- this is a problem // TODO:this is generally true, but would be nice to assert it { - const float * h_tgt = llama_get_embeddings_pre_norm(ctx_tgt); + const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt); std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1)); //{ @@ -625,7 +625,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { verify_h[seq_id].resize((size_t) n_rows * n_embd); for (int32_t i = 0; i < n_rows; ++i) { - const float * h = llama_get_embeddings_pre_norm_ith(ctx_tgt, i_batch_beg[seq_id] + i); + const float * h = llama_get_embeddings_nextn_ith(ctx_tgt, i_batch_beg[seq_id] + i); std::memcpy(verify_h[seq_id].data() + (size_t) i * n_embd, h, row_bytes); } @@ -686,7 +686,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { auto * smpl = smpls[seq_id].get(); common_sampler_sample(smpl, ctx_dft, i_batch, true); - h_row = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch); + h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_batch); ++i_batch; const auto * cur_p = common_sampler_get_candidates(smpl, true); @@ -772,7 +772,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { return false; } - bool need_embd_pre_norm() const override { + bool need_embd_nextn() const override { return true; } }; @@ -1539,13 +1539,13 @@ bool common_speculative_need_embd(common_speculative * spec) { return false; } -bool common_speculative_need_embd_pre_norm(common_speculative * spec) { +bool common_speculative_need_embd_nextn(common_speculative * spec) { if (spec == nullptr) { return false; } for (auto & impl : spec->impls) { - if (impl->need_embd_pre_norm()) { + if (impl->need_embd_nextn()) { 
return true; } } diff --git a/common/speculative.h b/common/speculative.h index deba7dac7208..bf76ad709e26 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -59,8 +59,8 @@ bool common_speculative_process(common_speculative * spec, const llama_batch & b // true if any implementation requires target post-norm embeddings to be extracted bool common_speculative_need_embd(common_speculative * spec); -// true if any implementation requires target pre-norm embeddings to be extracted -bool common_speculative_need_embd_pre_norm(common_speculative * spec); +// true if any implementation requires target nextn embeddings to be extracted +bool common_speculative_need_embd_nextn(common_speculative * spec); // generate drafts for the sequences specified with `common_speculative_get_draft_params` void common_speculative_draft(common_speculative * spec); diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e59c1f74e2b2..263237d00505 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -58,18 +58,20 @@ llama_context::llama_context( cparams.n_rs_seq = 0; } - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? 
params.yarn_beta_slow : hparams.yarn_beta_slow; - cparams.embeddings = params.embeddings; - cparams.embeddings_pre_norm = false; - cparams.offload_kqv = params.offload_kqv; - cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; - cparams.warmup = false; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow; + cparams.embeddings = params.embeddings; + cparams.embeddings_nextn = false; + cparams.embeddings_nextn_masked = false; + cparams.offload_kqv = params.offload_kqv; + cparams.no_perf = params.no_perf; + cparams.pooling_type = params.pooling_type; + cparams.warmup = false; + cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? 
hparams.rope_freq_base_train : params.rope_freq_base; @@ -882,34 +884,34 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } -float * llama_context::get_embeddings_pre_norm() { +float * llama_context::get_embeddings_nextn() { output_reorder(); - return embd_pre_norm.data; + return embd_nextn.data; } -float * llama_context::get_embeddings_pre_norm_ith(int32_t i) { +float * llama_context::get_embeddings_nextn_ith(int32_t i) { output_reorder(); try { - if (embd_pre_norm.data == nullptr) { - throw std::runtime_error("no pre-norm embeddings"); + if (embd_nextn.data == nullptr) { + throw std::runtime_error("no nextn embeddings"); } const uint32_t n_embd = model.hparams.n_embd; - if (!cparams.embeddings_pre_norm_masked) { - // unmasked: pre-norm rows are stored densely, indexed by raw token position. - if (i < 0 || (size_t)(i + 1) * n_embd > embd_pre_norm.size) { - throw std::runtime_error(format("out of range [0, %zu)", embd_pre_norm.size / n_embd)); + if (!cparams.embeddings_nextn_masked) { + // unmasked: nextn rows are stored densely, indexed by raw token position. 
+ if (i < 0 || (size_t)(i + 1) * n_embd > embd_nextn.size) { + throw std::runtime_error(format("out of range [0, %zu)", embd_nextn.size / n_embd)); } - return embd_pre_norm.data + (size_t) i * n_embd; + return embd_nextn.data + (size_t) i * n_embd; } const int64_t j = output_resolve_row(i); - return embd_pre_norm.data + j*n_embd; + return embd_nextn.data + j*n_embd; } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid pre-norm embeddings id %d, reason: %s\n", __func__, i, err.what()); + LLAMA_LOG_ERROR("%s: invalid nextn embeddings id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG GGML_ABORT("fatal error"); #else @@ -1098,11 +1100,11 @@ void llama_context::set_embeddings(bool value) { //sched_need_reserve = true; } -void llama_context::set_embeddings_pre_norm(bool value, bool masked) { +void llama_context::set_embeddings_nextn(bool value, bool masked) { LLAMA_LOG_DEBUG("%s: value = %d, masked = %d\n", __func__, value, masked); - cparams.embeddings_pre_norm = value; - cparams.embeddings_pre_norm_masked = masked; + cparams.embeddings_nextn = value; + cparams.embeddings_nextn_masked = masked; } void llama_context::set_causal_attn(bool value) { @@ -1319,7 +1321,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll } int llama_context::encode(const llama_batch & batch_inp) { - // MTP hook batches carry both token (next-token id) and embd (h_pre_norm row), + // MTP hook batches carry both token (next-token id) and embd (h_nextn row), // so accept either present rather than requiring exactly one. GGML_ASSERT(batch_inp.token || batch_inp.embd); @@ -1392,9 +1394,9 @@ int llama_context::encode(const llama_batch & batch_inp) { } } - auto * t_logits = res->get_logits(); - auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); - auto * t_h_pre_norm = cparams.embeddings_pre_norm ? 
res->get_h_pre_norm() : nullptr; + auto * t_logits = res->get_logits(); + auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); + auto * t_h_nextn = cparams.embeddings_nextn ? res->get_h_nextn() : nullptr; // extract logits if (logits.data && t_logits) { @@ -1460,14 +1462,14 @@ int llama_context::encode(const llama_batch & batch_inp) { } } - // extract pre-norm embeddings (hidden state before the final output norm) - if (embd_pre_norm.data && t_h_pre_norm && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm); + // extract nextn embeddings (hidden state before the final output norm) + if (embd_nextn.data && t_h_nextn && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_nextn); GGML_ASSERT(backend_h != nullptr); const uint32_t n_embd = hparams.n_embd; - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_pre_norm.size); - ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm.data, 0, n_tokens*n_embd*sizeof(float)); + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_nextn.size); + ggml_backend_tensor_get_async(backend_h, t_h_nextn, embd_nextn.data, 0, n_tokens*n_embd*sizeof(float)); } // TODO: hacky solution @@ -1622,7 +1624,7 @@ static bool needs_raw_logits(const llama_ubatch & ubatch, const std::mapget_logits(); - auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; - auto * t_h_pre_norm = cparams.embeddings_pre_norm ? res->get_h_pre_norm() : nullptr; + auto * t_logits = res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; + auto * t_h_nextn = cparams.embeddings_nextn ? 
res->get_h_nextn() : nullptr; if (t_embd && res->get_embd_pooled()) { t_embd = res->get_embd_pooled(); @@ -1905,22 +1907,22 @@ int llama_context::decode(const llama_batch & batch_inp) { } } - // extract pre-norm embeddings (hidden state before the final output norm) + // extract nextn embeddings before // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored. { - const bool masked = cparams.embeddings_pre_norm_masked; + const bool masked = cparams.embeddings_nextn_masked; const int64_t n_rows = masked ? n_outputs : (int64_t) ubatch.n_tokens; const int64_t offset = masked ? n_outputs_prev : n_tokens_prev; - if (embd_pre_norm.data && t_h_pre_norm && n_rows > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm); + if (embd_nextn.data && t_h_nextn && n_rows > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_nextn); GGML_ASSERT(backend_h != nullptr); - const uint32_t n_embd = hparams.n_embd; - float * embd_pre_norm_out = embd_pre_norm.data + offset*n_embd; + const uint32_t n_embd = hparams.n_embd; + float * embd_nextn_out = embd_nextn.data + offset*n_embd; - GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_pre_norm.size); - ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_rows*n_embd*sizeof(float)); + GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_nextn.size); + ggml_backend_tensor_get_async(backend_h, t_h_nextn, embd_nextn_out, 0, n_rows*n_embd*sizeof(float)); } } @@ -2012,9 +2014,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_embd = hparams.n_embd; const auto n_embd_out = hparams.n_embd_out(); - bool has_logits = true; - bool has_embd = cparams.embeddings; - bool has_embd_pre_norm = cparams.embeddings_pre_norm; + bool has_logits = true; + bool has_embd = cparams.embeddings; + 
bool has_embd_nextn = cparams.embeddings_nextn; // TODO: hacky enc-dec support if (model.arch == LLM_ARCH_T5) { @@ -2026,14 +2028,14 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { size_t backend_float_count = 0; size_t backend_token_count = 0; - logits.size = has_logits ? n_vocab*n_outputs_max : 0; - embd.size = has_embd ? n_embd_out*n_outputs_max : 0; - embd_pre_norm.size = has_embd_pre_norm ? n_embd*n_outputs_max : 0; + logits.size = has_logits ? n_vocab*n_outputs_max : 0; + embd.size = has_embd ? n_embd_out*n_outputs_max : 0; + embd_nextn.size = has_embd_nextn ? n_embd*n_outputs_max : 0; - if (has_embd_pre_norm && !cparams.embeddings_pre_norm_masked) { - // unmasked: pre-norm row exists for every token in the batch, not just + if (has_embd_nextn && !cparams.embeddings_nextn_masked) { + // unmasked: nextn row exists for every token in the batch, not just // those flagged via batch.logits[i] -> size by token count instead. - embd_pre_norm.size = (size_t) n_embd * n_batch; + embd_nextn.size = (size_t) n_embd * n_batch; } // Allocate backend sampling output buffers if there are backend samplers configured. @@ -2050,7 +2052,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; const size_t new_size = - (logits.size + embd.size + embd_pre_norm.size + backend_float_count) * sizeof(float) + + (logits.size + embd.size + embd_nextn.size + backend_float_count) * sizeof(float) + ( backend_token_count) * sizeof(llama_token); // alloc only when more than the current capacity is required @@ -2067,7 +2069,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { buf_output = nullptr; logits.data = nullptr; embd.data = nullptr; - embd_pre_norm.data = nullptr; + embd_nextn.data = nullptr; } auto * buft = ggml_backend_cpu_buffer_type(); @@ -2096,8 +2098,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd = has_embd ? 
buffer_view{(float *) (base + offset), embd.size} : buffer_view{nullptr, 0}; offset += embd.size * sizeof(float); - embd_pre_norm = has_embd_pre_norm ? buffer_view{(float *) (base + offset), embd_pre_norm.size} : buffer_view{nullptr, 0}; - offset += embd_pre_norm.size * sizeof(float); + embd_nextn = has_embd_nextn ? buffer_view{(float *) (base + offset), embd_nextn.size} : buffer_view{nullptr, 0}; + offset += embd_nextn.size * sizeof(float); if (has_sampling) { sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)}; @@ -2163,9 +2165,9 @@ void llama_context::output_reorder() { } } - if (embd_pre_norm.size > 0) { + if (embd_nextn.size > 0) { for (uint64_t k = 0; k < n_embd; k++) { - std::swap(embd_pre_norm.data[i0*n_embd + k], embd_pre_norm.data[i1*n_embd + k]); + std::swap(embd_nextn.data[i0*n_embd + k], embd_nextn.data[i1*n_embd + k]); } } @@ -3609,20 +3611,20 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) { return ctx->get_embeddings_seq(seq_id); } -void llama_set_embeddings_pre_norm(llama_context * ctx, bool value, bool masked) { - ctx->set_embeddings_pre_norm(value, masked); +void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) { + ctx->set_embeddings_nextn(value, masked); } -float * llama_get_embeddings_pre_norm(llama_context * ctx) { +float * llama_get_embeddings_nextn(llama_context * ctx) { ctx->synchronize(); - return ctx->get_embeddings_pre_norm(); + return ctx->get_embeddings_nextn(); } -float * llama_get_embeddings_pre_norm_ith(llama_context * ctx, int32_t i) { +float * llama_get_embeddings_nextn_ith(llama_context * ctx, int32_t i) { ctx->synchronize(); - return ctx->get_embeddings_pre_norm_ith(i); + return ctx->get_embeddings_nextn_ith(i); } bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) { diff --git a/src/llama-context.h b/src/llama-context.h index d03f681d4a13..2af92b0f0968 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ 
-84,8 +84,8 @@ struct llama_context { float * get_embeddings_ith(int32_t i); float * get_embeddings_seq(llama_seq_id seq_id); - float * get_embeddings_pre_norm(); - float * get_embeddings_pre_norm_ith(int32_t i); + float * get_embeddings_nextn(); + float * get_embeddings_nextn_ith(int32_t i); llama_token * get_sampled_tokens() const; llama_token get_sampled_token_ith(int32_t idx); @@ -110,7 +110,7 @@ struct llama_context { void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); void set_embeddings (bool value); - void set_embeddings_pre_norm(bool value, bool masked); + void set_embeddings_nextn(bool value, bool masked); void set_causal_attn(bool value); void set_warmup(bool value); @@ -282,10 +282,10 @@ struct llama_context { // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE buffer_view embd = {nullptr, 0}; - // hidden state before the final output norm (2-dimensional array: [n_outputs][n_embd]) - // populated only when cparams.embeddings_pre_norm is enabled and the model graph - // sets llm_graph_result::t_h_pre_norm - buffer_view embd_pre_norm = {nullptr, 0}; + // hidden state required by the nextn layers (2-dimensional array: [n_outputs][n_embd]) + // populated only when cparams.embeddings_nextn is enabled and the model graph + // sets llm_graph_result::t_h_nextn + buffer_view embd_nextn = {nullptr, 0}; struct sampling_info { // !samplers.empty() to check if any samplers are active diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 20ec59fe3357..a1515e9a061d 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -28,8 +28,8 @@ struct llama_cparams { float yarn_beta_slow; bool embeddings; - bool embeddings_pre_norm; // also extract the hidden state before the final output norm - bool embeddings_pre_norm_masked; // extract for only rows where batch.logits != 0 + bool embeddings_nextn; // also extract the hidden state before the final output norm + bool embeddings_nextn_masked; // extract for only rows 
where batch.logits != 0 bool causal_attn; bool offload_kqv; bool flash_attn; diff --git a/src/llama-ext.h b/src/llama-ext.h index edfa71c207c5..7ad6125fad33 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -89,18 +89,14 @@ LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * m LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx); -// -// pre-norm embeddings (hidden state before the final output norm) -// - -// Set whether the context outputs pre-norm embeddings or not +// Set whether the context outputs nextn embeddings or not // If masked == true, output the embeddings only for the tokens with batch.logits != 0 // If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits -LLAMA_API void llama_set_embeddings_pre_norm(struct llama_context * ctx, bool value, bool masked); +LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked); // mirrors: // LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); -LLAMA_API float * llama_get_embeddings_pre_norm (struct llama_context * ctx); +LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); -LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx, int32_t i); +LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 0ee9e5384042..1eef4e059dff 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -894,8 +894,8 @@ void llm_graph_result::set_outputs() { if (t_embd_pooled != nullptr) { ggml_set_output(t_embd_pooled); } - if (t_h_pre_norm != nullptr) { - ggml_set_output(t_h_pre_norm); + if (t_h_nextn != nullptr) { + ggml_set_output(t_h_nextn); } for (auto & [seq_id, t] : t_sampled) { if (t != nullptr) { diff --git a/src/llama-graph.h 
b/src/llama-graph.h index e240ade7b0c5..b514e4c23e07 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -663,7 +663,7 @@ class llm_graph_result { ggml_tensor * get_logits() const { return t_logits; } ggml_tensor * get_embd() const { return t_embd; } ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } - ggml_tensor * get_h_pre_norm() const { return t_h_pre_norm; } + ggml_tensor * get_h_nextn() const { return t_h_nextn; } ggml_cgraph * get_gf() const { return gf; } ggml_context * get_ctx() const { return ctx_compute.get(); } @@ -692,7 +692,7 @@ class llm_graph_result { ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; - ggml_tensor * t_h_pre_norm = nullptr; // [n_embd, n_outputs] hidden state before final output norm + ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before final output norm std::map t_sampled_logits; std::map t_candidates; diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index f8fd53696239..348650b37966 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -177,7 +177,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_pre_norm_masked) { + if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -209,16 +209,15 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para } cur = inpL; - cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; - if (!cparams.embeddings_pre_norm_masked && inp_out_ids) { + if (!cparams.embeddings_nextn_masked && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, 
inp_out_ids); } - // Final norm - cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); res->t_embd = cur; @@ -625,18 +624,16 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr cur = ggml_add(ctx0, cur, ffn_residual); cb(cur, "mtp_post_ffn", il); - // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. - // (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.) - cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; - - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? layer.nextn.shared_head_norm : model.output_norm; GGML_ASSERT(head_norm_w && "QWEN35 MTP: missing both nextn.shared_head_norm and output_norm"); cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + cur = ggml_get_rows(ctx0, cur, inp_out_ids); cb(cur, "mtp_shared_head_norm", -1); ggml_tensor * head_w = layer.nextn.shared_head_head ? 
layer.nextn.shared_head_head : model.output; diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 8db0b4717d99..7d906191cbbc 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -200,7 +200,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_pre_norm_masked) { + if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -232,16 +232,16 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p } cur = inpL; - cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; + // post-norm hidden state feeds both the LM head and the MTP seed below + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; - if (!cparams.embeddings_pre_norm_masked && inp_out_ids) { + if (!cparams.embeddings_nextn_masked && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); } - // Final norm - cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); res->t_embd = cur; @@ -721,17 +721,16 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm cur = ggml_add(ctx0, cur, ffn_residual); cb(cur, "mtp_post_ffn", il); - // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. - cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; - - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? 
layer.nextn.shared_head_norm : model.output_norm; GGML_ASSERT(head_norm_w && "QWEN35MOE MTP: missing both nextn.shared_head_norm and output_norm"); cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + + cb(cur, "h_nextn", -1); + res->t_h_nextn= cur; + + cur = ggml_get_rows(ctx0, cur, inp_out_ids); cb(cur, "mtp_shared_head_norm", -1); ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; diff --git a/src/models/step35.cpp b/src/models/step35.cpp index b1e379e65543..0633bfadbcea 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -200,8 +200,8 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "attn_proj", il); } - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -259,6 +259,13 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para cur = inpL; + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + if (!cparams.embeddings_nextn_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; @@ -269,3 +276,192 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para ggml_build_forward_expand(gf, cur); } + +// LLM_GRAPH_TYPE_DECODER_MTP draft head for Step3p5 (MoE) +llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) + : llm_graph_context(params) { + GGML_ASSERT(hparams.nextn_predict_layers > 0 && "STEP35 MTP requires nextn_predict_layers > 0"); + + // Single-block MTP only: always run the first trained MTP block (Qwen + // MTP / vLLM single-MTP-layer style). 
Multi-block round-robin proved to + // be a much deeper refactor than this PR justifies; the trailing MTP + // blocks are loaded with TENSOR_NOT_REQUIRED so pruned GGUFs (with just + // block 0) also work — see load_arch_tensors below and + // scripts/prune_step35_extra_mtp.py. + const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const auto & layer = model.layers[il]; + + GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); + GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); + GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); + + const uint32_t n_head_l = hparams.n_head(il); + const uint32_t n_head_kv_l = hparams.n_head_kv(il); + + const float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + + auto inp = std::make_unique(hparams.n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->embd); + ggml_set_name(inp->embd, "mtp_h_input"); + + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? 
layer.nextn.embed_tokens : model.tok_embd; + + ggml_tensor * h_input = inp->embd; + ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + cb(tok_embd, "mtp_tok_embd", il); + + res->add_input(std::move(inp)); + + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_attn = build_attn_inp_kv_iswa(); + + ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + cb(h_norm, "mtp_hnorm", il); + + ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); + cb(e_norm, "mtp_enorm", il); + + ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); + cb(concat, "mtp_concat", il); + + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + cb(cur, "mtp_eh_proj", il); + + ggml_tensor * inpSA = cur; + + // mtp_block: full Step3p5 decoder layer (attention with optional head-wise gate, then MoE/dense FFN) + cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_attn_norm", il); + + ggml_tensor * Qcur = build_lora_mm(layer.wq, cur, layer.wq_s); + ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); + ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); + cb(Qcur, "mtp_Qcur", il); + cb(Kcur, "mtp_Kcur", il); + cb(Vcur, "mtp_Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens); + + if (layer.attn_q_norm) { + Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "mtp_Qcur_normed", il); + } + if (layer.attn_k_norm) { + Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); + cb(Kcur, "mtp_Kcur_normed", il); + } + + const bool is_swa = hparams.is_swa(il); + ggml_tensor * rope_factors = is_swa ? 
nullptr : model.get_rope_factors(cparams, il); + const int64_t n_rot_l = hparams.n_rot(il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "mtp_Qcur_pos", il); + cb(Kcur, "mtp_Kcur_pos", il); + + const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k)); + ggml_tensor * attn_out = build_attn(inp_attn, + nullptr, nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + cb(attn_out, "mtp_attn_out", il); + + // head-wise attention gate: sigmoid(g_proj(x)) + if (layer.wqkv_gate) { + ggml_tensor * gate = build_lora_mm(layer.wqkv_gate, cur); // [n_head_l, n_tokens] + cb(gate, "mtp_attn_gate", il); + + gate = ggml_sigmoid(ctx0, gate); + cb(gate, "mtp_attn_gate_sigmoid", il); + + ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens); + ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens); + cb(gate_3d, "mtp_attn_gate_3d", il); + + attn_3d = ggml_mul(ctx0, attn_3d, gate_3d); + cb(attn_3d, "mtp_attn_gated_3d", il); + + attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens); + cb(attn_out, "mtp_attn_gated", il); + } + + cur = build_lora_mm(layer.wo, attn_out, layer.wo_s); + cb(cur, "mtp_attn_proj", il); + + cur = ggml_add(ctx0, cur, inpSA); + cb(cur, "mtp_attn_residual", il); + + ggml_tensor * ffn_inp = cur; + cur = build_norm(cur, layer.ffn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "mtp_ffn_norm", il); + + // FFN: dense MLP or MoE (mirrors trunk path) + if (layer.ffn_gate_inp == nullptr) { + cur = build_ffn(cur, + layer.ffn_up, layer.ffn_up_b, nullptr, + layer.ffn_gate, layer.ffn_gate_b, nullptr, + layer.ffn_down, layer.ffn_down_b, nullptr, + nullptr, + 
LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "mtp_ffn_out", il); + } else { + ggml_tensor * moe_out = build_moe_ffn(cur, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + layer.ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il); + cb(moe_out, "mtp_ffn_moe_out", il); + + ggml_tensor * sh_out = build_ffn(cur, + layer.ffn_up_shexp, nullptr, nullptr, + layer.ffn_gate_shexp, nullptr, nullptr, + layer.ffn_down_shexp, nullptr, nullptr, + nullptr, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(sh_out, "mtp_ffn_shared_out", il); + + cur = ggml_add(ctx0, moe_out, sh_out); + cb(cur, "mtp_ffn_out", il); + } + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "mtp_post_ffn", il); + + // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm + ? layer.nextn.shared_head_norm + : model.output_norm; + GGML_ASSERT(head_norm_w && "STEP35 MTP: missing both nextn.shared_head_norm and output_norm"); + cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + cb(cur, "mtp_shared_head_norm", -1); + + ggml_tensor * head_w = layer.nextn.shared_head_head ? 
layer.nextn.shared_head_head : model.output; + GGML_ASSERT(head_w && "STEP35 MTP: missing LM head (nextn.shared_head_head or model.output)"); + cur = build_lora_mm(head_w, cur); + cb(cur, "result_output", -1); + + res->t_logits = cur; + ggml_build_forward_expand(gf, cur); +} diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 59cb23dc292b..e48efd7663a7 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -259,9 +259,9 @@ struct server_slot { return task->need_embd() || (spec && common_speculative_need_embd(spec)); } - bool need_embd_pre_norm() const { + bool need_embd_nextn() const { GGML_ASSERT(task); - return spec && common_speculative_need_embd_pre_norm(spec); + return spec && common_speculative_need_embd_nextn(spec); } // if the context does not have a memory module then all embeddings have to be computed within a single ubatch @@ -3032,7 +3032,7 @@ struct server_context_impl { // embedding requires all tokens in the batch to be output; // MTP also wants logits at every prompt position so the - // streaming hook can mirror t_h_pre_norm into ctx_dft. + // streaming hook can mirror t_h_nextn into ctx_dft. 
common_batch_add(batch, cur_tok, slot.prompt.tokens.pos_next(), From d55c844ffb8b625b16d750aaf7c93551bdbdee7f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 5 Jun 2026 11:09:36 +0300 Subject: [PATCH 14/19] hparams : refactor `hparams.n_layer` (#24060) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * hparams : refactor hparams.n_layer * cont : remove `n_layer_kv()`, use n_layer_all instead * cont : type consistency * pi : update SYSTEM.md * models : fix Step3.5 MTP * cont : remove duplicate switch cases * cont : explicitly set `false` to extra layers for `is_swa` and `is_recr` * cont : fix nextn layer count handling Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret (cherry picked from commit 7acb4e8cd2ce21f457d1298e75fad729520d263c) --- src/llama-adapter.cpp | 8 +-- src/llama-context.cpp | 10 ++-- src/llama-graph.cpp | 2 +- src/llama-hparams.cpp | 83 +++++++++++++-------------- src/llama-hparams.h | 17 +++--- src/llama-kv-cache.cpp | 5 +- src/llama-memory-recurrent.cpp | 8 +-- src/llama-model-loader.cpp | 6 +- src/llama-model-saver.cpp | 6 +- src/llama-model.cpp | 80 ++++++++++++++------------ src/llama-model.h | 3 +- src/llama-quant.cpp | 4 +- src/models/afmoe.cpp | 2 +- src/models/apertus.cpp | 11 ++-- src/models/arcee.cpp | 2 +- src/models/arctic.cpp | 2 +- src/models/arwkv7.cpp | 2 +- src/models/baichuan.cpp | 2 +- src/models/bailingmoe.cpp | 2 +- src/models/bailingmoe2.cpp | 21 +++---- src/models/bert.cpp | 4 +- src/models/bitnet.cpp | 2 +- src/models/bloom.cpp | 2 +- src/models/chameleon.cpp | 2 +- src/models/chatglm.cpp | 3 +- src/models/codeshell.cpp | 3 +- src/models/cogvlm.cpp | 3 +- src/models/cohere2.cpp | 4 +- src/models/command-r.cpp | 3 +- src/models/dbrx.cpp | 12 ++-- src/models/deci.cpp | 3 +- src/models/deepseek2.cpp | 11 ++-- src/models/deepseek2ocr.cpp | 2 +- src/models/dots1.cpp | 3 +- src/models/dream.cpp | 3 +- src/models/ernie4-5.cpp | 2 +- src/models/eurobert.cpp 
| 2 +- src/models/exaone-moe.cpp | 22 ++++--- src/models/exaone.cpp | 2 +- src/models/exaone4.cpp | 4 +- src/models/falcon-h1.cpp | 2 +- src/models/falcon.cpp | 2 +- src/models/gemma-embedding.cpp | 2 +- src/models/gemma.cpp | 2 +- src/models/gemma2.cpp | 2 +- src/models/gemma3.cpp | 2 +- src/models/gemma3n.cpp | 6 +- src/models/gemma4.cpp | 6 +- src/models/glm-dsa.cpp | 17 +++--- src/models/glm4-moe.cpp | 26 ++++----- src/models/glm4.cpp | 20 +++---- src/models/gpt2.cpp | 3 +- src/models/gptneox.cpp | 3 +- src/models/granite-hybrid.cpp | 2 +- src/models/granite-moe.cpp | 2 +- src/models/granite.cpp | 2 +- src/models/grok.cpp | 2 +- src/models/grovemoe.cpp | 2 +- src/models/hunyuan-moe.cpp | 2 +- src/models/internlm2.cpp | 3 +- src/models/jais.cpp | 2 +- src/models/jais2.cpp | 2 +- src/models/jamba.cpp | 4 +- src/models/jina-bert-v2.cpp | 2 +- src/models/jina-bert-v3.cpp | 2 +- src/models/kimi-linear.cpp | 4 +- src/models/lfm2.cpp | 10 +++- src/models/lfm2moe.cpp | 4 +- src/models/llada-moe.cpp | 5 +- src/models/llada.cpp | 4 +- src/models/llama.cpp | 4 +- src/models/llama4.cpp | 2 +- src/models/maincoder.cpp | 3 +- src/models/mamba.cpp | 2 +- src/models/mamba2.cpp | 2 +- src/models/mimo2.cpp | 22 +++---- src/models/minicpm.cpp | 4 +- src/models/minicpm3.cpp | 2 +- src/models/minimax-m2.cpp | 2 +- src/models/mistral3.cpp | 2 +- src/models/modern-bert.cpp | 2 +- src/models/mpt.cpp | 2 +- src/models/nemotron-h.cpp | 4 +- src/models/nemotron.cpp | 3 +- src/models/neo-bert.cpp | 2 +- src/models/nomic-bert-moe.cpp | 2 +- src/models/nomic-bert.cpp | 2 +- src/models/olmo.cpp | 2 +- src/models/olmo2.cpp | 2 +- src/models/olmoe.cpp | 3 +- src/models/openai-moe.cpp | 2 +- src/models/openelm.cpp | 12 ++-- src/models/orion.cpp | 2 +- src/models/pangu-embed.cpp | 3 +- src/models/phi2.cpp | 2 +- src/models/phi3.cpp | 2 +- src/models/phimoe.cpp | 2 +- src/models/plamo.cpp | 2 +- src/models/plamo2.cpp | 4 +- src/models/plamo3.cpp | 2 +- src/models/plm.cpp | 3 +- 
src/models/qwen.cpp | 2 +- src/models/qwen2.cpp | 3 +- src/models/qwen2moe.cpp | 3 +- src/models/qwen3.cpp | 3 +- src/models/qwen35.cpp | 33 +++++------ src/models/qwen35moe.cpp | 33 +++++------ src/models/qwen3moe.cpp | 6 +- src/models/qwen3next.cpp | 8 +-- src/models/qwen3vl.cpp | 3 +- src/models/qwen3vlmoe.cpp | 3 +- src/models/refact.cpp | 3 +- src/models/rnd1.cpp | 5 +- src/models/rwkv6.cpp | 2 +- src/models/rwkv6qwen2.cpp | 2 +- src/models/rwkv7.cpp | 2 +- src/models/seed-oss.cpp | 3 +- src/models/smallthinker.cpp | 4 +- src/models/smollm3.cpp | 2 +- src/models/stablelm.cpp | 2 +- src/models/starcoder.cpp | 3 +- src/models/starcoder2.cpp | 3 +- src/models/step35.cpp | 101 ++++++++++++++++++++++++++++++--- src/models/t5.cpp | 4 +- src/models/xverse.cpp | 3 +- 125 files changed, 473 insertions(+), 383 deletions(-) diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 4a1aaa955a87..3e0fe66afff7 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -41,7 +41,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { ggml_init_params params = { - /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(), + /*.mem_size =*/ hparams.n_layer()*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -61,9 +61,9 @@ bool llama_adapter_cvec::init(const llama_model & model) { }; // make tensors - tensors.reserve(hparams.n_layer); + tensors.reserve(hparams.n_layer()); tensors.push_back(nullptr); // there's never a tensor for layer 0 - for (size_t il = 1; il < hparams.n_layer; il++) { + for (size_t il = 1; il < hparams.n_layer(); il++) { ggml_backend_buffer_type_t buft = model.select_buft(il); ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { @@ -121,7 +121,7 @@ bool llama_adapter_cvec::apply( layer_start = il_start; layer_end = il_end; - for (size_t il = 1; il < hparams.n_layer; il++) { + for (size_t il = 1; il < hparams.n_layer(); il++) { assert(tensors[il] != nullptr); 
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 263237d00505..a41d2499ba7b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -339,7 +339,7 @@ llama_context::llama_context( // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = model.n_devices() > 1 && - model.n_gpu_layers() > model.hparams.n_layer && + model.n_gpu_layers() > model.hparams.n_layer() && model.split_mode() == LLAMA_SPLIT_MODE_LAYER && cparams.offload_kqv && !model.has_tensor_overrides(); @@ -2345,7 +2345,7 @@ llm_graph_cb llama_context::graph_get_cb() const { // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer; + const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer(); if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = model.dev_layer(il); @@ -3416,7 +3416,7 @@ llama_context * llama_init_from_model( const bool k_is_turbo = (params.type_k == GGML_TYPE_TURBO2_0 || params.type_k == GGML_TYPE_TURBO3_0 || params.type_k == GGML_TYPE_TURBO4_0); - for (uint32_t il = 0; il < model->hparams.n_layer; ++il) { + for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) { uint32_t head_k = model->hparams.n_embd_head_k(il); // Turbo types zero-pad heads to next multiple of 128 in llama-kv-cache.cpp if (k_is_turbo && head_k % 128 != 0) { @@ -3436,7 +3436,7 @@ llama_context * llama_init_from_model( params.type_v == GGML_TYPE_TURBO3_0 || params.type_v == GGML_TYPE_TURBO4_0); const bool is_mla = model->hparams.is_mla(); - for (uint32_t il = 0; il < model->hparams.n_layer; ++il) { + for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) { uint32_t head_v 
= model->hparams.n_embd_head_v(il); // Turbo types zero-pad; MLA has no separate V cache (V = view of K) if (v_is_turbo && !is_mla && head_v % 128 != 0) { @@ -3471,7 +3471,7 @@ llama_context * llama_init_from_model( } if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && - model->hparams.nextn_predict_layers == 0) { + model->hparams.n_layer_nextn == 0) { LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__); return nullptr; } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 1eef4e059dff..ef712be139e9 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -970,7 +970,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : cparams (params.cparams), ubatch (params.ubatch), n_embd (hparams.n_embd), - n_layer (hparams.n_layer), + n_layer (hparams.n_layer()), n_rot (hparams.n_rot()), n_ctx (cparams.n_ctx), n_head (hparams.n_head()), diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 087afec55c60..e1e49d1cc1fe 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -7,31 +7,38 @@ void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { if (dense_first) { - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer(); ++il) { is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0); } } else { - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer(); ++il) { is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); } } + + for (uint32_t il = n_layer(); il < n_layer_all; ++il) { + is_swa_impl[il] = false; + } } -// TODO: implement -//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) { -// if (dense_first) { -// for (uint32_t il = 0; il < n_layer; ++il) { -// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0); -// } -// } else { -// for (uint32_t il = 0; il < n_layer; ++il) { -// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); -// } 
-// } -//} +void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) { + if (dense_first) { + for (uint32_t il = 0; il < n_layer(); ++il) { + is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0); + } + } else { + for (uint32_t il = 0; il < n_layer(); ++il) { + is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); + } + } + + for (uint32_t il = n_layer(); il < n_layer_all; ++il) { + is_recr_impl[il] = false; + } +} bool llama_hparams::is_swa_any() const { - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (is_swa_impl[il]) { return true; } @@ -41,7 +48,7 @@ bool llama_hparams::is_swa_any() const { } uint32_t llama_hparams::n_head(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_head_arr[il]; } @@ -49,7 +56,7 @@ uint32_t llama_hparams::n_head(uint32_t il) const { } uint32_t llama_hparams::n_head_kv(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_head_kv_arr[il]; } @@ -57,7 +64,7 @@ uint32_t llama_hparams::n_head_kv(uint32_t il) const { } uint32_t llama_hparams::n_ff(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_ff_arr[il]; } @@ -76,7 +83,7 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const { } uint32_t llama_hparams::n_rot(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? n_rot_swa : n_rot_full; } @@ -98,7 +105,7 @@ uint32_t llama_hparams::n_embd_out() const { } uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full; } @@ -106,7 +113,7 @@ uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { } uint32_t llama_hparams::n_embd_head_v(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? 
n_embd_head_v_swa : n_embd_head_v_full; } @@ -127,7 +134,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const { bool llama_hparams::is_n_embd_k_gqa_variable() const { const uint32_t val = n_embd_k_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (val != n_embd_k_gqa(il)) { return true; } @@ -138,7 +145,7 @@ bool llama_hparams::is_n_embd_k_gqa_variable() const { bool llama_hparams::is_n_embd_v_gqa_variable() const { const uint32_t val = n_embd_v_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (val != n_embd_v_gqa(il)) { return true; } @@ -149,7 +156,7 @@ bool llama_hparams::is_n_embd_v_gqa_variable() const { uint32_t llama_hparams::n_embd_k_gqa_max() const { uint32_t val = n_embd_k_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { val = std::max(val, n_embd_k_gqa(il)); } @@ -158,7 +165,7 @@ uint32_t llama_hparams::n_embd_k_gqa_max() const { uint32_t llama_hparams::n_embd_v_gqa_max() const { uint32_t val = n_embd_v_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { val = std::max(val, n_embd_v_gqa(il)); } @@ -207,11 +214,11 @@ uint32_t llama_hparams::n_embd_s() const { } bool llama_hparams::is_recr(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_recr_impl[il]; } - GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer); + GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all); } uint32_t llama_hparams::n_pos_per_embd() const { @@ -219,11 +226,11 @@ uint32_t llama_hparams::n_pos_per_embd() const { } bool llama_hparams::is_swa(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa_impl[il]; } - GGML_ABORT("fatal error"); + GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all); } bool 
llama_hparams::is_mla() const { @@ -242,12 +249,6 @@ uint32_t llama_hparams::n_embd_head_v_mla() const { } bool llama_hparams::has_kv(uint32_t il) const { - if (kv_only_nextn) { - // MTP head: only the trailing nextn_predict_layers blocks own a KV cache; - // the leading trunk blocks are not executed in this graph. - return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers); - } - if (n_layer_kv_from_start >= 0) { if (il < (uint32_t) n_layer_kv_from_start) { return true; @@ -260,16 +261,8 @@ bool llama_hparams::has_kv(uint32_t il) const { return true; } -uint32_t llama_hparams::n_layer_kv() const { - uint32_t res = 0; - - for (uint32_t il = 0; il < n_layer; ++il) { - if (has_kv(il)) { - res++; - } - } - - return res; +uint32_t llama_hparams::n_layer() const { + return n_layer_all - n_layer_nextn; } bool llama_hparams::use_mrope() const { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 43b6940a0e93..094526eca82b 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -45,12 +45,15 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; - uint32_t n_layer; - int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache + uint32_t n_layer_all; + uint32_t n_layer_nextn = 0; uint32_t n_expert = 0; uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; + // TODO: this needs to be reworked + int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache + // different head size for full_attention and SWA layers uint32_t n_embd_head_k_full; // dimension of keys (d_k). 
d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head @@ -93,9 +96,6 @@ struct llama_hparams { uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; uint32_t moe_every_n_layers = 0; uint32_t moe_latent_size = 0; - uint32_t nextn_predict_layers = 0; - - bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches) float f_norm_eps; float f_norm_rms_eps; @@ -261,8 +261,7 @@ struct llama_hparams { bool is_swa(uint32_t il) const; - // TODO: implement - //void set_recr_pattern(uint32_t n_pattern, bool dense_first = false); + void set_recr_pattern(uint32_t n_pattern, bool dense_first = false); // whether or not the given layer is recurrent (for hybrid models) bool is_recr(uint32_t il) const; @@ -318,8 +317,8 @@ struct llama_hparams { bool has_kv(uint32_t il) const; - // number of layers for which has_kv() returns true - uint32_t n_layer_kv() const; + // number of effective layers (excludes nextn layers) + uint32_t n_layer() const; // note that this function uses different SWA parameters from those in the hparams // note: inlined on purpose for performance reasons diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 5a1a1c2e88b8..11fd5547b387 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -146,6 +146,7 @@ llama_kv_cache::llama_kv_cache( } } + const uint32_t n_layer = hparams.n_layer(); const uint32_t n_layer_kv = hparams.n_layer_kv(); // define a comparator for the buft -> ctx map to ensure that the order is well-defined: @@ -210,7 +211,7 @@ llama_kv_cache::llama_kv_cache( const bool is_mla = hparams.is_mla(); - for (uint32_t il = 0; il < hparams.n_layer; il++) { + for (uint32_t il = 0; il < n_layer; il++) { if (!hparams.has_kv(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il); continue; @@ -389,7 +390,7 @@ 
llama_kv_cache::llama_kv_cache( if (reuse) { LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__); - for (uint32_t il = 0; il < hparams.n_layer; il++) { + for (uint32_t il = 0; il < n_layer; il++) { const int32_t il_reuse = reuse(il); if (il_reuse < 0) { diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index ec5dc5835dd4..6a4892fb471d 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -26,7 +26,7 @@ llama_memory_recurrent::llama_memory_recurrent( uint32_t n_seq_max, uint32_t n_rs_seq, const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) { - const int32_t n_layer = hparams.n_layer; + const int32_t n_layer = hparams.n_layer(); head = 0; size = mem_size; @@ -863,7 +863,7 @@ void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std:: void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { const uint32_t s_trans = 0; - const uint32_t n_layer = hparams.n_layer; + const uint32_t n_layer = hparams.n_layer(); io.write(&s_trans, sizeof(s_trans)); io.write(&n_layer, sizeof(n_layer)); @@ -1047,8 +1047,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell io.read(&s_trans, sizeof(s_trans)); io.read(&n_layer, sizeof(n_layer)); - if (n_layer != hparams.n_layer) { - LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + if (n_layer != hparams.n_layer()) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer()); return false; } if (cell_count > size) { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 01e1a22498f6..374788e2468f 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -1054,10 +1054,10 @@ struct ggml_tensor * llama_model_loader::create_tensor( if (it == ctx_map.end()) { // one ggml context per buffer type int max_n_tensors = 
n_tensors; - max_n_tensors += 1; // duplicated output tensor - max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors + max_n_tensors += 1; // duplicated output tensor + max_n_tensors += hparams.n_layer()*2; // duplicated rope freq tensors if (files.empty()) { - max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses + max_n_tensors += hparams.n_layer()*256; // this should be well above what any model actually uses } const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors; diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 168ac335bd40..88692babbb6c 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -76,7 +76,7 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) { template void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) { GGML_ASSERT(model != nullptr || !per_layer); - const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size(); + const size_t n_values = per_layer ? 
size_t(model->hparams.n_layer()) : value.size(); GGML_ASSERT(n_values <= value.size()); if (n_values == 0) { @@ -205,7 +205,7 @@ void llama_model_saver::add_kv_from_model() { if (hparams.n_embd_out_impl > 0) { add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl); } - add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer); + add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer_all); add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true); add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); @@ -226,7 +226,7 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale); add_kv(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers); - add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers); + add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn); add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers); add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type)); add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 20584deb2638..0ffb7cdc4787 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -391,7 +391,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str rotation = get_il_eff(il) % ud->n_devices; } else { il = 0; - rotation = hparams.n_layer % ud->n_devices; + rotation = hparams.n_layer() % ud->n_devices; } const ggml_tensor * tensor_axis_0 = suffix.empty() ? 
tensor : ud->model->get_tensor((prefix + suffix).c_str()); if (tensor_axis_0 == nullptr) { @@ -1000,7 +1000,7 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer_all); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); @@ -1055,13 +1055,13 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f); std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f); - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer(), false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer(), false); bool rope_finetuned = false; ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); @@ -1160,7 +1160,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { const auto & use_mlock = params.use_mlock; const auto & tensor_split = params.tensor_split; - const int n_layer = hparams.n_layer; + const int n_layer = hparams.n_layer_all; const int n_gpu_layers = 
this->n_gpu_layers(); const bool use_mmap_buffer = true; @@ -1217,10 +1217,10 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { splits[i] /= split_sum; } - const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0); - const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1); + const int i_gpu_start = std::max(n_layer + 1 - n_gpu_layers, 0); + const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { - const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il); + const bool is_swa = il < n_layer && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; @@ -1523,7 +1523,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } if (llama_supports_gpu_offload()) { - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + const int n_gpu = std::min(n_gpu_layers, n_layer); int n_repeating = n_gpu; if (n_repeating > 0) { @@ -1532,8 +1532,8 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating); - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_backend_supported_layers = n_layer + 1; + const int max_offloadable_layers = n_layer + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); } @@ -1602,7 +1602,7 @@ const float * llama_model::tensor_split() const { } uint32_t llama_model::n_gpu_layers() const { - return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1; + return params.n_gpu_layers >= 0 ? 
params.n_gpu_layers : hparams.n_layer() + 1; } llama_split_mode llama_model::split_mode() const { @@ -1673,17 +1673,17 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp()); - LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); - LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer()); + LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer()).c_str()); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full); LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any()); LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full); LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full); - LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_embd_k_gqa 
= %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer()).c_str()); LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); @@ -1691,7 +1691,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale); LLAMA_LOG_INFO("%s: f_attn_value_scale = %.4f\n", __func__, hparams.f_attn_value_scale); - LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer()).c_str()); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups); @@ -1814,7 +1814,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); - LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers); + LLAMA_LOG_INFO("%s: n_layer_nextn = %d\n", __func__, hparams.n_layer_nextn); } if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) { @@ -1979,22 +1979,21 @@ llama_memory_i * llama_model::create_memory(const 
llama_memory_params & params, llama_memory_hybrid::layer_filter_cb filter_attn = nullptr; llama_memory_hybrid::layer_filter_cb filter_recr = nullptr; if (arch == LLM_ARCH_FALCON_H1) { - filter_attn = [&](int32_t) { return true; }; - filter_recr = [&](int32_t) { return true; }; + filter_attn = [&](uint32_t) { return true; }; + filter_recr = [&](uint32_t) { return true; }; } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { - filter_attn = [&](int32_t il) { + filter_attn = [&](uint32_t il) { return !hparams.is_recr(il) && hparams.n_ff(il) == 0; }; - filter_recr = [&](int32_t il) { + filter_recr = [&](uint32_t il) { return hparams.is_recr(il) && hparams.n_ff(il) == 0; }; } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - filter_attn = [&, n_main](int32_t il) { - return (uint32_t)il < n_main && !hparams.is_recr(il); + filter_attn = [&](uint32_t il) { + return il < hparams.n_layer() && !hparams.is_recr(il); }; - filter_recr = [&, n_main](int32_t il) { - return (uint32_t)il < n_main && hparams.is_recr(il); + filter_recr = [&](uint32_t il) { + return il < hparams.n_layer() && hparams.is_recr(il); }; } @@ -2043,9 +2042,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_kv_cache::layer_filter_cb filter = nullptr; if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { - reuse = [&](int32_t il) { - if (il >= (int32_t) hparams.n_layer_kv_from_start) { - return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1); + reuse = [&](uint32_t il) { + GGML_ASSERT(hparams.n_layer_kv_from_start >= 2); + + if (il >= (uint32_t)hparams.n_layer_kv_from_start) { + return hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 
2 : 1); } return -1; @@ -2053,8 +2054,15 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } if (mtp_on_hybrid_qwen35) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; }; + filter = [&](uint32_t il) { return il >= hparams.n_layer(); }; + } + + if (arch == LLM_ARCH_STEP35 && hparams.n_layer_nextn > 0) { + if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP) { + filter = [&](uint32_t il) { return il >= hparams.n_layer(); }; + } else { + filter = [&](uint32_t il) { return il < hparams.n_layer(); }; + } } if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { @@ -2177,7 +2185,7 @@ int32_t llama_model_n_embd_out(const llama_model * model) { } int32_t llama_model_n_layer(const llama_model * model) { - return model->hparams.n_layer; + return model->hparams.n_layer(); } int32_t llama_model_n_head(const llama_model * model) { diff --git a/src/llama-model.h b/src/llama-model.h index 398a0aa725c3..3a8866d30a21 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -694,7 +694,8 @@ const char * llm_type_name(llm_type type); // convenience macro for loading local variables for load_tensors() in llama_model_base // note: cast to int64_t since we will use these for the tensor dimensions #define LLAMA_LOAD_LOCALS \ - const int n_layer = hparams.n_layer; GGML_UNUSED(n_layer); \ + const int n_layer = hparams.n_layer(); GGML_UNUSED(n_layer); \ + const int n_layer_all = hparams.n_layer_all; GGML_UNUSED(n_layer_all); \ const int64_t n_head = hparams.n_head(); GGML_UNUSED(n_head); \ const int64_t n_head_kv = hparams.n_head_kv(); GGML_UNUSED(n_head_kv); \ const int64_t n_embd = hparams.n_embd; GGML_UNUSED(n_embd); \ diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f86c9cce55e6..6695dcfb23f3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -849,7 +849,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vectorhparams.n_embd = 
desc->n_embd; model->hparams.n_embd_head_k_full = desc->n_embd_head_k; model->hparams.n_embd_head_v_full = desc->n_embd_head_v; - model->hparams.n_layer = desc->n_layer; + model->hparams.n_layer_all = desc->n_layer; model->hparams.n_expert = desc->n_expert; for (uint32_t i = 0; i < desc->n_layer; i++) { diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index a7c77ee5d28b..063b214256e7 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -30,7 +30,7 @@ void llama_model_afmoe::load_arch_hparams(llama_model_loader & ml) { hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 56: type = LLM_TYPE_6B; break; case 32: type = LLM_TYPE_26B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp index bec7136521c6..6dfb8905fbe5 100644 --- a/src/models/apertus.cpp +++ b/src/models/apertus.cpp @@ -2,12 +2,13 @@ void llama_model_apertus::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer); - switch (hparams.n_layer) { + ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer()); + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp index d086c4717ff1..9536e7c5d429 100644 --- 
a/src/models/arcee.cpp +++ b/src/models/arcee.cpp @@ -4,7 +4,7 @@ void llama_model_arcee::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); // Arcee uses the same structure as Llama - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp index 27deadffeb75..09ee0f752f0e 100644 --- a/src/models/arctic.cpp +++ b/src/models/arctic.cpp @@ -4,7 +4,7 @@ void llama_model_arctic::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); if (hparams.n_expert == 128) { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 35: type = LLM_TYPE_10B_128x3_66B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arwkv7.cpp b/src/models/arwkv7.cpp index 9bd04127b25a..b38b2064785b 100644 --- a/src/models/arwkv7.cpp +++ b/src/models/arwkv7.cpp @@ -10,7 +10,7 @@ void llama_model_arwkv7::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: switch (hparams.n_embd) { case 768: type = LLM_TYPE_190M; break; diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp index 4d26081cd5d4..585f36141741 100644 --- a/src/models/baichuan.cpp +++ b/src/models/baichuan.cpp @@ -2,7 +2,7 @@ void llama_model_baichuan::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp index fe1ae10864b8..7faf73c835b6 100644 --- 
a/src/models/bailingmoe.cpp +++ b/src/models/bailingmoe.cpp @@ -8,7 +8,7 @@ void llama_model_bailingmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_16B; break; case 88: type = LLM_TYPE_290B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp index 2f0d44a62596..5000e9c6db89 100644 --- a/src/models/bailingmoe2.cpp +++ b/src/models/bailingmoe2.cpp @@ -9,17 +9,13 @@ void llama_model_bailingmoe2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 20: type = LLM_TYPE_16B_A1B; break; - case 21: type = LLM_TYPE_16B_A1B; break; case 32: type = LLM_TYPE_100B_A6B; break; - case 33: type = LLM_TYPE_100B_A6B; break; default: type = LLM_TYPE_UNKNOWN; } } @@ -39,9 +35,9 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) { GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2"); GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2"); - for 
(int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -78,7 +74,7 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); @@ -112,8 +108,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph ggml_tensor * inp_out_ids = build_inp_out_ids(); - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -146,7 +141,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/bert.cpp b/src/models/bert.cpp index 3c28f419ccf2..53ce29f23cae 100644 --- a/src/models/bert.cpp +++ b/src/models/bert.cpp @@ -1,9 +1,9 @@ #include "models.h" void llama_model_bert::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + 
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 3: type = LLM_TYPE_17M; break; // bge-micro case 6: diff --git a/src/models/bitnet.cpp b/src/models/bitnet.cpp index 7e8125deec40..c83302745806 100644 --- a/src/models/bitnet.cpp +++ b/src/models/bitnet.cpp @@ -3,7 +3,7 @@ void llama_model_bitnet::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/bloom.cpp b/src/models/bloom.cpp index 30b0f3d07d06..609d2ddf998b 100644 --- a/src/models/bloom.cpp +++ b/src/models/bloom.cpp @@ -3,7 +3,7 @@ void llama_model_bloom::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 30: switch (hparams.n_embd) { diff --git a/src/models/chameleon.cpp b/src/models/chameleon.cpp index 4bceaefd63b3..4f45acecf848 100644 --- a/src/models/chameleon.cpp +++ b/src/models/chameleon.cpp @@ -6,7 +6,7 @@ void llama_model_chameleon::load_arch_hparams(llama_model_loader & ml) { hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_34B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp index 6766fa71c159..7ae5b938fde3 100644 --- a/src/models/chatglm.cpp +++ b/src/models/chatglm.cpp @@ -2,7 +2,8 @@ void llama_model_chatglm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: { if 
(hparams.n_head(0) == 16) { type = LLM_TYPE_1_5B; diff --git a/src/models/codeshell.cpp b/src/models/codeshell.cpp index 274dd3342a7b..de53bb981847 100644 --- a/src/models/codeshell.cpp +++ b/src/models/codeshell.cpp @@ -2,7 +2,8 @@ void llama_model_codeshell::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 42: type = LLM_TYPE_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp index 2e231bb3f935..750f57a394e3 100644 --- a/src/models/cogvlm.cpp +++ b/src/models/cogvlm.cpp @@ -2,7 +2,8 @@ void llama_model_cogvlm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/cohere2.cpp b/src/models/cohere2.cpp index a514cf88fc63..61a5945a1948 100644 --- a/src/models/cohere2.cpp +++ b/src/models/cohere2.cpp @@ -5,6 +5,7 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) { uint32_t swa_period = 4; ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); hparams.set_swa_pattern(swa_period); + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; @@ -12,7 +13,8 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/command-r.cpp b/src/models/command-r.cpp index adf7fcaa20f8..94a46188bb89 100644 --- 
a/src/models/command-r.cpp +++ b/src/models/command-r.cpp @@ -3,7 +3,8 @@ void llama_model_command_r::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_35B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp index af71c7753658..4f5ac4d06a45 100644 --- a/src/models/dbrx.cpp +++ b/src/models/dbrx.cpp @@ -1,14 +1,14 @@ #include "models.h" void llama_model_dbrx::load_arch_hparams(llama_model_loader & ml) { -ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); -ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); -switch (hparams.n_layer) { - case 40: type = LLM_TYPE_16x12B; break; - default: type = LLM_TYPE_UNKNOWN; + switch (hparams.n_layer()) { + case 40: type = LLM_TYPE_16x12B; break; + default: type = LLM_TYPE_UNKNOWN; + } } - } void llama_model_dbrx::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; diff --git a/src/models/deci.cpp b/src/models/deci.cpp index 567e35352761..cdfcf29e02f5 100644 --- a/src/models/deci.cpp +++ b/src/models/deci.cpp @@ -2,7 +2,8 @@ void llama_model_deci::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 80: type = LLM_TYPE_70B; break; case 162: type = LLM_TYPE_405B; break; diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 1fe54adc13e2..a9e8bc514036 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -5,7 +5,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, 
false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false); // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B - const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256)); + const bool is_lite = (hparams.n_layer() == 27 || hparams.n_layer() == 26 || (hparams.n_layer() == 48 && n_vocab == 128256)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); @@ -23,7 +23,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { // for compatibility with existing DeepSeek V2 and V2.5 GGUFs // that have no expert_gating_func model parameter set - if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) { + if ((hparams.n_layer() == 47 || hparams.n_layer() == 48) && n_vocab == 154880) { // GLM 4.7 Lite hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; } else { @@ -43,7 +43,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { hparams.f_attn_temp_offset = 0.0f; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 27: type = LLM_TYPE_16B; break; case 47: type = LLM_TYPE_30B_A3B; break; case 60: type = LLM_TYPE_236B; break; @@ -191,8 +191,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p ggml_tensor * inp_out_ids = build_inp_out_ids(); - int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < effective_n_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -366,7 +365,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); } } - if (il == effective_n_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur 
= ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/deepseek2ocr.cpp b/src/models/deepseek2ocr.cpp index f9e4c98785c1..65d31c31b93e 100644 --- a/src/models/deepseek2ocr.cpp +++ b/src/models/deepseek2ocr.cpp @@ -14,7 +14,7 @@ void llama_model_deepseek2ocr::load_arch_hparams(llama_model_loader & ml) { hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp index 435d27281c69..07d6ab1b7cdb 100644 --- a/src/models/dots1.cpp +++ b/src/models/dots1.cpp @@ -8,7 +8,8 @@ void llama_model_dots1::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_142B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/dream.cpp b/src/models/dream.cpp index 12ac6f1ce88f..abe737c335a4 100644 --- a/src/models/dream.cpp +++ b/src/models/dream.cpp @@ -2,8 +2,9 @@ void llama_model_dream::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // Dream models are primarily 7B with 28 layers - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_7B; break; diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp index 9b39c605e350..895cf690bd2d 100644 --- a/src/models/ernie4-5.cpp +++ b/src/models/ernie4-5.cpp @@ -12,7 +12,7 @@ void llama_model_ernie4_5::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); } - switch (hparams.n_layer) { + switch 
(hparams.n_layer()) { case 18: type = LLM_TYPE_0_3B; break; case 28: type = LLM_TYPE_21B_A3B; break; case 54: type = LLM_TYPE_300B_A47B; break; diff --git a/src/models/eurobert.cpp b/src/models/eurobert.cpp index ddf13c3028f2..0948d7de656f 100644 --- a/src/models/eurobert.cpp +++ b/src/models/eurobert.cpp @@ -3,7 +3,7 @@ void llama_model_eurobert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - if (hparams.n_layer == 12) { + if (hparams.n_layer() == 12) { type = LLM_TYPE_SMALL; // 0.2B } } diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp index 76d91982fc53..bccf169f8c00 100644 --- a/src/models/exaone-moe.cpp +++ b/src/models/exaone-moe.cpp @@ -20,13 +20,12 @@ void llama_model_exaone_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_30B_A3B; break; - case 48: - case 49: type = LLM_TYPE_235B_A22B; break; + case 48: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } } @@ -50,9 +49,9 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { 
// skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -70,7 +69,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end - if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers)) { + if (i < (int) hparams.n_layer_dense_lead || (i >= n_layer)) { layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); @@ -95,7 +94,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags); @@ -130,8 +129,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ ggml_tensor * inp_out_ids = build_inp_out_ids(); - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // use RoPE for SWA layers @@ -170,7 +168,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); cb(cur, 
"attn_out", il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/exaone.cpp b/src/models/exaone.cpp index c7e9960d7185..676fb37b5a63 100644 --- a/src/models/exaone.cpp +++ b/src/models/exaone.cpp @@ -3,7 +3,7 @@ void llama_model_exaone::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/exaone4.cpp b/src/models/exaone4.cpp index 499e22dde81e..595289d72f57 100644 --- a/src/models/exaone4.cpp +++ b/src/models/exaone4.cpp @@ -1,7 +1,7 @@ #include "models.h" void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) { - if (hparams.n_layer == 64) { // 32B + if (hparams.n_layer() == 64) { // 32B hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.n_swa = 4096; uint32_t swa_period = 4; @@ -16,7 +16,7 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_1_2B; break; case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp index c130ccdd49ee..d6ef2d519862 100644 --- a/src/models/falcon-h1.cpp +++ b/src/models/falcon-h1.cpp @@ -13,7 +13,7 @@ void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) { std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_0_5B; break; case 24: diff --git a/src/models/falcon.cpp b/src/models/falcon.cpp index 
ad546ef2db56..b2ad90b32724 100644 --- a/src/models/falcon.cpp +++ b/src/models/falcon.cpp @@ -3,7 +3,7 @@ void llama_model_falcon::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 60: type = LLM_TYPE_40B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma-embedding.cpp b/src/models/gemma-embedding.cpp index 4e07f5f2bdaf..80ed3b1a4605 100644 --- a/src/models/gemma-embedding.cpp +++ b/src/models/gemma-embedding.cpp @@ -21,7 +21,7 @@ void llama_model_gemma_embedding::load_arch_hparams(llama_model_loader & ml) { GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd"); GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_0_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/gemma.cpp b/src/models/gemma.cpp index 1519682fdf62..651cd7e64de0 100644 --- a/src/models/gemma.cpp +++ b/src/models/gemma.cpp @@ -3,7 +3,7 @@ void llama_model_gemma::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_2B; break; case 28: type = LLM_TYPE_7B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma2.cpp b/src/models/gemma2.cpp index ae3f9ffb5301..2fbfb15a94a3 100644 --- a/src/models/gemma2.cpp +++ b/src/models/gemma2.cpp @@ -16,7 +16,7 @@ void llama_model_gemma2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); - switch 
(hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_2B; break; case 42: type = LLM_TYPE_9B; break; case 46: type = LLM_TYPE_27B; break; diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp index 63a2b380e71a..690194529e38 100644 --- a/src/models/gemma3.cpp +++ b/src/models/gemma3.cpp @@ -17,7 +17,7 @@ void llama_model_gemma3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_270M; break; case 26: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_8B; break; // Rnj-1 diff --git a/src/models/gemma3n.cpp b/src/models/gemma3n.cpp index 6ec3a006081f..83eb8250aa94 100644 --- a/src/models/gemma3n.cpp +++ b/src/models/gemma3n.cpp @@ -6,14 +6,14 @@ void llama_model_gemma3n::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(swa_period); - hparams.n_layer_kv_from_start = 20; - hparams.f_attention_scale = 1.0f; + hparams.n_layer_kv_from_start = 20; + hparams.f_attention_scale = 1.0f; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_E2B; break; case 35: type = LLM_TYPE_E4B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index 31906de33d97..7198e5411164 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -2,12 +2,12 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + 
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); uint32_t n_kv_shared_layers = 0; ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); - hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers; + hparams.n_layer_kv_from_start = hparams.n_layer_all - (int32_t)n_kv_shared_layers; hparams.f_attention_scale = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling) ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); @@ -19,7 +19,7 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_26B_A4B; break; case 35: type = LLM_TYPE_E2B; break; case 42: type = LLM_TYPE_E4B; break; diff --git a/src/models/glm-dsa.cpp b/src/models/glm-dsa.cpp index af2b55ef563d..11d91312defc 100644 --- a/src/models/glm-dsa.cpp +++ b/src/models/glm-dsa.cpp @@ -33,13 +33,10 @@ void llama_model_glm_dsa::load_arch_hparams(llama_model_loader & ml) { } // NextN/MTP parameters - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 79: type = LLM_TYPE_744B_A40B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -76,9 +73,9 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) { output = 
create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED; @@ -135,8 +132,8 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) { layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); } - // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + // NextN/MTP tensors (preserved but unused) - conditionally load for last n_layer_nextn + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 27654b8cba3c..3105c56b5306 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -20,16 +20,13 @@ void llama_model_glm4_moe::load_arch_hparams(llama_model_loader & ml) { } // NextN/MTP parameters - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be 
updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { - case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer) - case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open - case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer) + switch (hparams.n_layer()) { + case 46: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air + case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open + case 92: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 default: type = LLM_TYPE_UNKNOWN; } } @@ -54,9 +51,9 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) { // Load ALL tensors including NextN layer to satisfy total tensor count // but only PROCESS up to last layer (skipping final NextN layer) in forward pass - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -116,7 +113,7 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); @@ -161,8 +158,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass - const int 
n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // Pre-attention norm @@ -211,7 +207,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa model.layers[il].wo, NULL, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index 7c242fed298e..b4326c5f2107 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -5,13 +5,10 @@ void llama_model_glm4::load_arch_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); // NextN/MTP parameters (GLM-OCR) - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 17: type = LLM_TYPE_1B; break; // GLM-OCR case 40: type = LLM_TYPE_9B; break; case 61: type = LLM_TYPE_32B; break; @@ -32,9 +29,9 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if 
(hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -55,7 +52,7 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) { layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags); // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); @@ -100,8 +97,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // Pre-attention norm @@ -140,7 +136,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params model.layers[il].wo, NULL, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/gpt2.cpp b/src/models/gpt2.cpp index e2dcc8b1521e..45afbccc1210 100644 --- a/src/models/gpt2.cpp +++ b/src/models/gpt2.cpp @@ -2,7 +2,8 @@ void llama_model_gpt2::load_arch_hparams(llama_model_loader & ml) { 
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_SMALL; break; case 24: type = LLM_TYPE_MEDIUM; break; case 36: type = LLM_TYPE_LARGE; break; diff --git a/src/models/gptneox.cpp b/src/models/gptneox.cpp index 443e35addf28..ed5e8c50da2f 100644 --- a/src/models/gptneox.cpp +++ b/src/models/gptneox.cpp @@ -3,7 +3,8 @@ void llama_model_gptneox::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 6: switch (hparams.n_ff()) { case 512: type = LLM_TYPE_14M; break; diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp index 8740d9fc7d9b..eb23095aece6 100644 --- a/src/models/granite-hybrid.cpp +++ b/src/models/granite-hybrid.cpp @@ -19,7 +19,7 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) { hparams.rope_finetuned = rope_finetuned; // A layer is recurrent IFF the n_head_kv value is set to 0 - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } diff --git a/src/models/granite-moe.cpp b/src/models/granite-moe.cpp index 0d89bc1f3404..115263c418fa 100644 --- a/src/models/granite-moe.cpp +++ b/src/models/granite-moe.cpp @@ -12,7 +12,7 @@ void llama_model_granite_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); hparams.rope_finetuned = rope_finetuned; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_3B; break; // Add additional layer/vocab/etc checks here for other model sizes diff --git a/src/models/granite.cpp b/src/models/granite.cpp index cda4aa231fa9..7aff942da015 100644 --- a/src/models/granite.cpp 
+++ b/src/models/granite.cpp @@ -12,7 +12,7 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); hparams.rope_finetuned = rope_finetuned; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_3B; break; // Add additional layer/vocab/etc checks here for other model sizes diff --git a/src/models/grok.cpp b/src/models/grok.cpp index 7c46ec1c0f2d..42f38af67243 100644 --- a/src/models/grok.cpp +++ b/src/models/grok.cpp @@ -26,7 +26,7 @@ void llama_model_grok::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 64: type = LLM_TYPE_314B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp index 1cab75adc7fe..643a448e59ac 100644 --- a/src/models/grovemoe.cpp +++ b/src/models/grovemoe.cpp @@ -7,7 +7,7 @@ void llama_model_grovemoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp index deb3c9671f33..4d55f5e7f317 100644 --- a/src/models/hunyuan-moe.cpp +++ b/src/models/hunyuan-moe.cpp @@ -5,7 +5,7 @@ void llama_model_hunyuan_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_A13B; break; default: type = 
LLM_TYPE_UNKNOWN; } diff --git a/src/models/internlm2.cpp b/src/models/internlm2.cpp index f9ee37a24b69..f6cfdfb9458f 100644 --- a/src/models/internlm2.cpp +++ b/src/models/internlm2.cpp @@ -2,7 +2,8 @@ void llama_model_internlm2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_20B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jais.cpp b/src/models/jais.cpp index 2ba162605f13..415103ce23a8 100644 --- a/src/models/jais.cpp +++ b/src/models/jais.cpp @@ -4,7 +4,7 @@ void llama_model_jais::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_3B; break; case 40: type = LLM_TYPE_13B; break; /* TODO: add variants */ diff --git a/src/models/jais2.cpp b/src/models/jais2.cpp index 8966131441c3..8610fcc9f82f 100644 --- a/src/models/jais2.cpp +++ b/src/models/jais2.cpp @@ -3,7 +3,7 @@ void llama_model_jais2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; case 68: type = LLM_TYPE_70B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp index a62b121b3eea..dba160b014fd 100644 --- a/src/models/jamba.cpp +++ b/src/models/jamba.cpp @@ -8,11 +8,11 @@ void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } - switch 
(hparams.n_layer) { + switch (hparams.n_layer()) { // TODO: Jamba layers are a bit heterogeneous, so naming this is hard. case 12: // 900M 8x???M case 32: // 51B 16x?B diff --git a/src/models/jina-bert-v2.cpp b/src/models/jina-bert-v2.cpp index 4f8866ece4d2..86ff1c84d1a6 100644 --- a/src/models/jina-bert-v2.cpp +++ b/src/models/jina-bert-v2.cpp @@ -4,7 +4,7 @@ void llama_model_jina_bert_v2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); hparams.f_max_alibi_bias = 8.0f; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jina-bert-v3.cpp b/src/models/jina-bert-v3.cpp index e0527529f56c..1c974a6f16cc 100644 --- a/src/models/jina-bert-v3.cpp +++ b/src/models/jina-bert-v3.cpp @@ -3,7 +3,7 @@ void llama_model_jina_bert_v3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_558M; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index c13f71b5bcb1..367f6990d1fb 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -14,7 +14,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) { // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba) // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention) - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent } @@ -25,7 +25,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, 
hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index 3898b56bb12d..97da8a6abb84 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -5,10 +5,13 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0; } - hparams.n_layer_dense_lead = hparams.n_layer; + + hparams.n_layer_dense_lead = hparams.n_layer(); + switch (hparams.n_ff()) { case 4608: type = LLM_TYPE_350M; break; case 6912: type = LLM_TYPE_700M; break; @@ -16,9 +19,10 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { case 10752: type = LLM_TYPE_2_6B; break; default: type = LLM_TYPE_UNKNOWN; } + if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { hparams.is_swa_impl[il] = !hparams.is_recr_impl[il]; } } diff --git a/src/models/lfm2moe.cpp b/src/models/lfm2moe.cpp index 81ced2eaba2f..490f5c223ebb 100644 --- a/src/models/lfm2moe.cpp +++ b/src/models/lfm2moe.cpp @@ -9,11 +9,11 @@ void llama_model_lfm2moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - for (uint32_t il = 0; il < hparams.n_layer; ++il) { + for (uint32_t il = 0; il < hparams.n_layer(); 
++il) { hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_8B_A1B; break; case 40: type = LLM_TYPE_24B_A2B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp index 9722dde9f177..2ae893864472 100644 --- a/src/models/llada-moe.cpp +++ b/src/models/llada-moe.cpp @@ -2,11 +2,12 @@ void llama_model_llada_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // diffusion language model uses non-causal attention hparams.causal_attn = false; - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_A1_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/llada.cpp b/src/models/llada.cpp index 58b2c466e176..87d4259f9a74 100644 --- a/src/models/llada.cpp +++ b/src/models/llada.cpp @@ -2,14 +2,16 @@ void llama_model_llada::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } + // Set non-causal attention for diffusion models hparams.causal_attn = false; } diff --git a/src/models/llama.cpp b/src/models/llama.cpp index cef66d054b0c..c0ec7e0a9adb 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -7,13 +7,13 @@ void llama_model_llama::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); if (hparams.n_expert == 8) { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8x7B; break; case 56: type = LLM_TYPE_8x22B; break; default: type = LLM_TYPE_UNKNOWN; } } else { - switch 
(hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B case 22: type = LLM_TYPE_1B; break; case 26: type = LLM_TYPE_3B; break; diff --git a/src/models/llama4.cpp b/src/models/llama4.cpp index 8f39b3f59a5b..7194c72a5850 100644 --- a/src/models/llama4.cpp +++ b/src/models/llama4.cpp @@ -8,7 +8,7 @@ void llama_model_llama4::load_arch_hparams(llama_model_loader & ml) { const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); if (found_swa && hparams.n_swa == 0) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; - hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope + hparams.n_no_rope_layer_step = hparams.n_layer(); // always use rope } else { hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED; hparams.n_swa = 8192; diff --git a/src/models/maincoder.cpp b/src/models/maincoder.cpp index 84cfe399027e..ae56a26a1f64 100644 --- a/src/models/maincoder.cpp +++ b/src/models/maincoder.cpp @@ -2,7 +2,8 @@ void llama_model_maincoder::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/mamba.cpp b/src/models/mamba.cpp index 887a1fa509a9..0d94e98281ce 100644 --- a/src/models/mamba.cpp +++ b/src/models/mamba.cpp @@ -9,7 +9,7 @@ void llama_model_mamba::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: switch (hparams.n_embd) { case 768: type = LLM_TYPE_SMALL; break; diff --git a/src/models/mamba2.cpp b/src/models/mamba2.cpp index 3277ca53ec4b..c5951cf0f7fc 100644 --- a/src/models/mamba2.cpp +++ b/src/models/mamba2.cpp @@ -9,7 +9,7 @@ void llama_model_mamba2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 
hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: switch (hparams.n_embd) { case 768: type = LLM_TYPE_SMALL; break; diff --git a/src/models/mimo2.cpp b/src/models/mimo2.cpp index 1bcdf696f2e8..889891605701 100644 --- a/src/models/mimo2.cpp +++ b/src/models/mimo2.cpp @@ -9,18 +9,17 @@ void llama_model_mimo2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); float value_scale = 0.0f; if (ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, value_scale, false) && value_scale != 1.0f) { hparams.f_attn_value_scale = value_scale; } - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_310B_A15B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -35,16 +34,14 @@ void llama_model_mimo2::load_arch_tensors(llama_model_loader &) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - const uint32_t n_nextn = hparams.nextn_predict_layers; - - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { auto & layer = layers[i]; uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); uint32_t n_embd_v_gqa = 
hparams.n_embd_v_gqa(i); uint32_t n_head = hparams.n_head(i); // NextN/MTP layers (the last n_nextn blocks) are preserved but disabled pending support - const bool is_nextn = (n_nextn > 0) && (static_cast<uint32_t>(i) >= n_layer - n_nextn); + const bool is_nextn = i >= n_layer; const int skip = is_nextn ? TENSOR_SKIP : 0; create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, skip); @@ -93,10 +90,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param const float v_scale = hparams.f_attn_value_scale; - // The last hparams.nextn_predict_layers blocks are MTP heads, currently inactive - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; uint32_t n_head_l = hparams.n_head(il); @@ -174,7 +168,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param } } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/minicpm.cpp b/src/models/minicpm.cpp index 966d3af615c0..fc3e5b171d52 100644 --- a/src/models/minicpm.cpp +++ b/src/models/minicpm.cpp @@ -3,7 +3,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) { // Backward-compatible defaults for older MiniCPM GGUFs hparams.f_embedding_scale = 12.0f; - hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer)); + hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer())); hparams.f_logit_scale = hparams.n_embd ? 
(256.0f / float(hparams.n_embd)) : 1.0f; ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -16,7 +16,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) { // MiniCPM uses rope by default, unlike Granite which uses it as a switch hparams.rope_finetuned = true; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 52: type = LLM_TYPE_1B; break; case 40: type = LLM_TYPE_2B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp index 1ffc54fa7c66..e011b1ff0a84 100644 --- a/src/models/minicpm3.cpp +++ b/src/models/minicpm3.cpp @@ -5,7 +5,7 @@ void llama_model_minicpm3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp index 22e291d73a33..b25435e4d97c 100644 --- a/src/models/minimax-m2.cpp +++ b/src/models/minimax-m2.cpp @@ -5,7 +5,7 @@ void llama_model_minimax_m2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_230B_A10B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp index 4e6ebef82cbb..0b119cc93cce 100644 --- a/src/models/mistral3.cpp +++ b/src/models/mistral3.cpp @@ -18,7 +18,7 @@ void llama_model_mistral3::load_arch_hparams(llama_model_loader & ml) { } } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_3B; break; case 34: type = LLM_TYPE_8B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/modern-bert.cpp 
b/src/models/modern-bert.cpp index e9b79ffc6dc0..cca841f5c74e 100644 --- a/src/models/modern-bert.cpp +++ b/src/models/modern-bert.cpp @@ -14,7 +14,7 @@ void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_47M; break; // granite-embedding-small case 22: diff --git a/src/models/mpt.cpp b/src/models/mpt.cpp index 0229d20ed369..d094fd9f80bc 100644 --- a/src/models/mpt.cpp +++ b/src/models/mpt.cpp @@ -5,7 +5,7 @@ void llama_model_mpt::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_30B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index d2c811d2497c..a456269347b9 100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -9,7 +9,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) { // A layer is recurrent IFF the n_head_kv value is set to 0 and // the n_ff value is set to 0 - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0); } @@ -22,7 +22,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B case 56: type = LLM_TYPE_9B; break; case 88: type = LLM_TYPE_120B_A12B; break; diff --git a/src/models/nemotron.cpp 
b/src/models/nemotron.cpp index 5d4a3b5c69e5..6e2bd9a33ca8 100644 --- a/src/models/nemotron.cpp +++ b/src/models/nemotron.cpp @@ -2,7 +2,8 @@ void llama_model_nemotron::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/neo-bert.cpp b/src/models/neo-bert.cpp index f00d6eddfc99..4a08d7abd40f 100644 --- a/src/models/neo-bert.cpp +++ b/src/models/neo-bert.cpp @@ -3,7 +3,7 @@ void llama_model_neo_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - if (hparams.n_layer == 28) { + if (hparams.n_layer() == 28) { type = LLM_TYPE_250M; } } diff --git a/src/models/nomic-bert-moe.cpp b/src/models/nomic-bert-moe.cpp index a17abe2c2692..da4b62919bb9 100644 --- a/src/models/nomic-bert-moe.cpp +++ b/src/models/nomic-bert-moe.cpp @@ -4,7 +4,7 @@ void llama_model_nomic_bert_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); - if (hparams.n_layer == 12 && hparams.n_embd == 768) { + if (hparams.n_layer() == 12 && hparams.n_embd == 768) { if (arch == LLM_ARCH_NOMIC_BERT) { type = LLM_TYPE_137M; } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { diff --git a/src/models/nomic-bert.cpp b/src/models/nomic-bert.cpp index 5a8a5584457e..e7fc72286a6d 100644 --- a/src/models/nomic-bert.cpp +++ b/src/models/nomic-bert.cpp @@ -4,7 +4,7 @@ void llama_model_nomic_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); - if (hparams.n_layer == 12 && hparams.n_embd == 768) { + if (hparams.n_layer() == 12 && hparams.n_embd == 768) 
{ if (arch == LLM_ARCH_NOMIC_BERT) { type = LLM_TYPE_137M; } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { diff --git a/src/models/olmo.cpp b/src/models/olmo.cpp index cfcf17bcb035..9f7a2ba60efb 100644 --- a/src/models/olmo.cpp +++ b/src/models/olmo.cpp @@ -4,7 +4,7 @@ void llama_model_olmo::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 22: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; case 80: type = LLM_TYPE_70B; break; diff --git a/src/models/olmo2.cpp b/src/models/olmo2.cpp index 7cc262f55046..cb52cdef7204 100644 --- a/src/models/olmo2.cpp +++ b/src/models/olmo2.cpp @@ -17,7 +17,7 @@ void llama_model_olmo2::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp index 7976ae44a51c..1e2baeb207ff 100644 --- a/src/models/olmoe.cpp +++ b/src/models/olmoe.cpp @@ -2,7 +2,8 @@ void llama_model_olmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_A1_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/openai-moe.cpp b/src/models/openai-moe.cpp index 15b6c8c12057..3ab15d61f08c 100644 --- a/src/models/openai-moe.cpp +++ b/src/models/openai-moe.cpp @@ -14,7 +14,7 @@ void llama_model_openai_moe::load_arch_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - switch 
(hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_20B; break; case 36: type = LLM_TYPE_120B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp index 9f76350fd4d1..13120bd3236d 100644 --- a/src/models/openelm.cpp +++ b/src/models/openelm.cpp @@ -3,12 +3,12 @@ void llama_model_openelm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { - case 16: type = LLM_TYPE_270M; break; - case 20: type = LLM_TYPE_450M; break; - case 28: type = LLM_TYPE_1B; break; - case 36: type = LLM_TYPE_3B; break; - default: type = LLM_TYPE_UNKNOWN; + switch (hparams.n_layer()) { + case 16: type = LLM_TYPE_270M; break; + case 20: type = LLM_TYPE_450M; break; + case 28: type = LLM_TYPE_1B; break; + case 36: type = LLM_TYPE_3B; break; + default: type = LLM_TYPE_UNKNOWN; } } diff --git a/src/models/orion.cpp b/src/models/orion.cpp index bcb4bbba4b10..863a2822269b 100644 --- a/src/models/orion.cpp +++ b/src/models/orion.cpp @@ -3,7 +3,7 @@ void llama_model_orion::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_14B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/pangu-embed.cpp b/src/models/pangu-embed.cpp index 7593f879b24f..90f05c088c10 100644 --- a/src/models/pangu-embed.cpp +++ b/src/models/pangu-embed.cpp @@ -2,7 +2,8 @@ void llama_model_pangu_embed::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1 case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1 default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/phi2.cpp b/src/models/phi2.cpp index 
8f3ed5f7b7d2..81b1ad12cc02 100644 --- a/src/models/phi2.cpp +++ b/src/models/phi2.cpp @@ -3,7 +3,7 @@ void llama_model_phi2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp index f8a4a4d5aa50..716ff814cc12 100644 --- a/src/models/phi3.cpp +++ b/src/models/phi3.cpp @@ -3,7 +3,7 @@ void llama_model_phi3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/phimoe.cpp b/src/models/phimoe.cpp index 4575d6139cf5..c332553bc7d1 100644 --- a/src/models/phimoe.cpp +++ b/src/models/phimoe.cpp @@ -3,7 +3,7 @@ void llama_model_phimoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_16x3_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plamo.cpp b/src/models/plamo.cpp index c7ed1211c311..246144519e43 100644 --- a/src/models/plamo.cpp +++ b/src/models/plamo.cpp @@ -3,7 +3,7 @@ void llama_model_plamo::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp index 2ffa0898f712..b93cf48bc5c6 100644 --- a/src/models/plamo2.cpp +++ b/src/models/plamo2.cpp @@ -11,11 +11,11 @@ void llama_model_plamo2::load_arch_hparams(llama_model_loader 
& ml) { ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_1B; break; case 32: if (hparams.n_embd == 2048) { diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp index 29f3e803d685..16d0b1dcef71 100644 --- a/src/models/plamo3.cpp +++ b/src/models/plamo3.cpp @@ -13,7 +13,7 @@ void llama_model_plamo3::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_2B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plm.cpp b/src/models/plm.cpp index ce050919e6a8..8ca325f5e2cf 100644 --- a/src/models/plm.cpp +++ b/src/models/plm.cpp @@ -3,7 +3,8 @@ void llama_model_plm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/qwen.cpp b/src/models/qwen.cpp index 00467dbad7d5..1f5dff3843cf 100644 --- a/src/models/qwen.cpp +++ b/src/models/qwen.cpp @@ -3,7 +3,7 @@ void llama_model_qwen::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp index a5147460bae6..e9c2ea80a6be 100644 --- a/src/models/qwen2.cpp +++ b/src/models/qwen2.cpp @@ -2,7 +2,8 @@ void 
llama_model_qwen2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break; case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break; case 32: type = LLM_TYPE_7B; break; diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp index 7cb03859debb..e831ed11aad6 100644 --- a/src/models/qwen2moe.cpp +++ b/src/models/qwen2moe.cpp @@ -5,7 +5,8 @@ void llama_model_qwen2moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_A2_7B; break; case 28: type = LLM_TYPE_57B_A14B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index 41b97fed9564..1d0d2fab362a 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -2,7 +2,8 @@ void llama_model_qwen3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; case 36: type = hparams.n_embd == 2560 ? 
LLM_TYPE_4B : LLM_TYPE_8B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 348650b37966..4b642cff467c 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -13,22 +13,20 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. - if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.is_recr_impl[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break; case 32: type = hparams.n_embd == 2560 ? 
LLM_TYPE_4B : LLM_TYPE_9B; break; case 64: type = LLM_TYPE_27B; break; @@ -39,9 +37,7 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - const bool mtp_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); @@ -122,10 +118,10 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); }; - for (int i = 0; i < (int) n_main; ++i) { + for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } - for (int i = (int) n_main; i < n_layer; ++i) { + for (int i = n_layer; i < n_layer_all; ++i) { load_block_mtp(i); } } @@ -159,8 +155,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. 
- const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -177,7 +172,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -490,15 +485,15 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_ffn(ggml_tensor * cur, cons // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 dense series llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35 MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35 MTP currently only supports a single MTP block"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35 MTP requires n_layer_nextn > 0"); + GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35 MTP currently only supports a single MTP block"); const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); // hparams.n_layer includes both main model layers and MTP layers. The MTP // layer is stored immediately after the main layers in model.layers[]. 
- const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 7d906191cbbc..eb5e9a406a15 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -16,22 +16,20 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. 
- if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.is_recr_impl[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_35B_A3B; break; case 48: type = LLM_TYPE_122B_A10B; break; case 60: type = LLM_TYPE_397B_A17B; break; @@ -42,9 +40,7 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - const bool mtp_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); const int trunk_flags = mtp_only ? 
TENSOR_NOT_REQUIRED : 0; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); @@ -145,10 +141,10 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); }; - for (int i = 0; i < (int) n_main; ++i) { + for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } - for (int i = (int) n_main; i < n_layer; ++i) { + for (int i = n_layer; i < n_layer_all; ++i) { load_block_mtp(i); } } @@ -182,8 +178,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. - const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -200,7 +195,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -555,13 +550,13 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, c // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE 
MTP currently only supports a single MTP block"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35MOE MTP requires n_layer_nextn > 0"); + GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35MOE MTP currently only supports a single MTP block"); const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index a4f8e1379c90..317e668bec79 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -1,10 +1,10 @@ #include "models.h" void llama_model_qwen3moe::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 9e09ae6f232d..97200a44072f 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -14,15 +14,15 @@ void llama_model_qwen3next::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // Mark recurrent layers (linear attention layers) - if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer, false)) { + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - 
hparams.is_recr_impl[i] = ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_80B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 5defd8939448..724d6140d193 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -4,7 +4,8 @@ void llama_model_qwen3vl::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false); ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_1_7B; break; case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; case 64: type = LLM_TYPE_32B; break; diff --git a/src/models/qwen3vlmoe.cpp b/src/models/qwen3vlmoe.cpp index 5b77df57122e..7c41592f7727 100644 --- a/src/models/qwen3vlmoe.cpp +++ b/src/models/qwen3vlmoe.cpp @@ -5,7 +5,8 @@ void llama_model_qwen3vlmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/refact.cpp b/src/models/refact.cpp index bf3949a90920..a46c358fa68a 100644 --- a/src/models/refact.cpp +++ b/src/models/refact.cpp @@ -2,7 +2,8 @@ void llama_model_refact::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 
hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp index ca8e009615e9..fc276ce591bf 100644 --- a/src/models/rnd1.cpp +++ b/src/models/rnd1.cpp @@ -2,12 +2,13 @@ void llama_model_rnd1::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } + // Set non-causal attention for diffusion models hparams.causal_attn = false; } diff --git a/src/models/rwkv6.cpp b/src/models/rwkv6.cpp index ba2a9dfa0db4..0b5013dc7585 100644 --- a/src/models/rwkv6.cpp +++ b/src/models/rwkv6.cpp @@ -9,7 +9,7 @@ void llama_model_rwkv6::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_6B; break; case 32: switch (hparams.n_embd) { diff --git a/src/models/rwkv6qwen2.cpp b/src/models/rwkv6qwen2.cpp index 566b8cdcb549..6c7db514435e 100644 --- a/src/models/rwkv6qwen2.cpp +++ b/src/models/rwkv6qwen2.cpp @@ -9,7 +9,7 @@ void llama_model_rwkv6qwen2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_6B; break; case 32: switch (hparams.n_embd) { diff --git a/src/models/rwkv7.cpp b/src/models/rwkv7.cpp index 7574b252621c..67c51f5b59c6 100644 --- a/src/models/rwkv7.cpp +++ b/src/models/rwkv7.cpp @@ -10,7 +10,7 @@ 
void llama_model_rwkv7::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: switch (hparams.n_embd) { case 768: type = LLM_TYPE_190M; break; diff --git a/src/models/seed-oss.cpp b/src/models/seed-oss.cpp index 806cba574be8..57de881a091f 100644 --- a/src/models/seed-oss.cpp +++ b/src/models/seed-oss.cpp @@ -2,7 +2,8 @@ void llama_model_seed_oss::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 64: type = LLM_TYPE_36B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 4231cccc666f..a8e3d957f1f0 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -15,14 +15,14 @@ void llama_model_smallthinker::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; - hparams.n_no_rope_layer_step = hparams.n_layer; + hparams.n_no_rope_layer_step = hparams.n_layer(); } ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_4B; break; case 52: type = LLM_TYPE_20B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/smollm3.cpp b/src/models/smollm3.cpp index 90e7d473eaff..c67d967b2040 100644 --- a/src/models/smollm3.cpp +++ b/src/models/smollm3.cpp @@ -4,7 +4,7 @@ void llama_model_smollm3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, 
hparams.f_norm_rms_eps); hparams.n_no_rope_layer_step = 4; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/stablelm.cpp b/src/models/stablelm.cpp index 4da7f7aefcf0..bf6087b87960 100644 --- a/src/models/stablelm.cpp +++ b/src/models/stablelm.cpp @@ -3,7 +3,7 @@ void llama_model_stablelm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_12B; break; diff --git a/src/models/starcoder.cpp b/src/models/starcoder.cpp index e131af058bc3..f73a88fd4e90 100644 --- a/src/models/starcoder.cpp +++ b/src/models/starcoder.cpp @@ -2,7 +2,8 @@ void llama_model_starcoder::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 36: type = LLM_TYPE_3B; break; case 42: type = LLM_TYPE_7B; break; diff --git a/src/models/starcoder2.cpp b/src/models/starcoder2.cpp index 9c207c028856..b81b469374a4 100644 --- a/src/models/starcoder2.cpp +++ b/src/models/starcoder2.cpp @@ -2,7 +2,8 @@ void llama_model_starcoder2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_3B; break; case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_15B; break; diff --git a/src/models/step35.cpp b/src/models/step35.cpp index 0633bfadbcea..753bd68a4429 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -23,12 +23,16 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); 
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer(), false); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer(), false); - switch (hparams.n_layer) { + // NextN/MTP (Step3p5): extra decoder block appended beyond the main stack. + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); + + switch (hparams.n_layer()) { case 45: type = LLM_TYPE_196B_A11B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -37,6 +41,15 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { void llama_model_step35::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + // Trunk-only: the GGUF declares MTP layers in metadata but the actual MTP + // tensors live in a separate file (e.g. user split target/draft). Mark + // MTP tensors NOT_REQUIRED so the trunk loads cleanly. + const std::string mtp_probe = "blk." + std::to_string(n_layer) + ".nextn.eh_proj.weight"; + const bool trunk_only = (hparams.n_layer_nextn > 0) && (ml.get_weight(mtp_probe.c_str()) == nullptr); + const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; + const int mtp_flags = trunk_only ? 
TENSOR_NOT_REQUIRED : 0; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output @@ -97,6 +110,79 @@ void llama_model_step35::load_arch_tensors(llama_model_loader &) { layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED); + }; + + auto load_block_mtp = [&](int i, bool is_first_mtp) { + auto & layer = layers[i]; + + const uint32_t n_head_l = hparams.n_head(i); + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); + + // The MTP block is a full Step3p5 decoder layer (mtp_block) plus the + // NextN-specific wiring (enorm/hnorm/eh_proj + optional shared head). + // `mtp_flags` becomes NOT_REQUIRED when the GGUF is trunk-only. + // + // Only the FIRST MTP block (i == n_main) is required for the + // single-block MTP runtime; trailing MTP blocks are always tolerated + // as missing so pruned GGUFs (block 0 only) load cleanly. Override + // mtp_flags to NOT_REQUIRED for those. + const int eff_mtp_flags = is_first_mtp ? 
mtp_flags : (mtp_flags | TENSOR_NOT_REQUIRED); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, eff_mtp_flags); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); + + if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { + layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); + layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); + } else { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); + } + + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, eff_mtp_flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, eff_mtp_flags); + + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, eff_mtp_flags); + + // dense MLP (leading dense blocks) — present if the MTP block isn't MoE + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); + + // MoE routed experts + selection bias (router_bias) + const int64_t n_ff_exp = hparams.n_ff_exp; + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_gate_exps = 
create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); + + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED); + + // NextN-specific tensors that define the MTP block. + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, eff_mtp_flags); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, eff_mtp_flags); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, eff_mtp_flags); + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); + }; + + for (int i = 0; i < n_layer; ++i) { + load_block_trunk(i, trunk_flags); + } + // Only the first MTP block (i == n_main) is required at runtime — the + // single-block-MTP graph in build_arch_graph always uses that one. 
+ // Trailing MTP blocks are loaded if present (so an un-pruned GGUF with + // all MTP layers still works) but tolerated when absent via the pruning + // path. See scripts/prune_step35_extra_mtp.py for the pruner. + for (int i = n_layer; i < n_layer_all; ++i) { + load_block_mtp(i, /*is_first_mtp=*/ i == n_layer); } } @@ -113,6 +199,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para auto * inp_attn = build_attn_inp_kv_iswa(); ggml_tensor * inp_out_ids = build_inp_out_ids(); + // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -200,7 +287,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "attn_proj", il); } - if (il == n_transformer_layers - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -280,7 +367,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para // LLM_GRAPH_TYPE_DECODER_MTP draft head for Step3p5 (MoE) llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "STEP35 MTP requires nextn_predict_layers > 0"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "STEP35 MTP requires n_layer_nextn > 0"); // Single-block MTP only: always run the first trained MTP block (Qwen // MTP / vLLM single-MTP-layer style). Multi-block round-robin proved to @@ -288,7 +375,7 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr // blocks are loaded with TENSOR_NOT_REQUIRED so pruned GGUFs (with just // block 0) also work — see load_arch_tensors below and // scripts/prune_step35_extra_mtp.py. 
- const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); diff --git a/src/models/t5.cpp b/src/models/t5.cpp index 73e327414066..b0e3f062572f 100644 --- a/src/models/t5.cpp +++ b/src/models/t5.cpp @@ -9,10 +9,10 @@ void llama_model_t5::load_arch_hparams(llama_model_loader & ml) { hparams.dec_start_token_id = dec_start_token_id; } - hparams.dec_n_layer = hparams.n_layer; + hparams.dec_n_layer = hparams.n_layer(); ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 6: type = LLM_TYPE_60M; break; // t5-small case 8: type = LLM_TYPE_80M; break; // flan-t5-small case 12: diff --git a/src/models/xverse.cpp b/src/models/xverse.cpp index d6d1c7a2e5de..3135001293a8 100644 --- a/src/models/xverse.cpp +++ b/src/models/xverse.cpp @@ -2,7 +2,8 @@ void llama_model_xverse::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; case 80: type = LLM_TYPE_65B; break; From 6d9a4a89345a248255eaf5d4e561a7f0b73bac0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 7 Jun 2026 13:43:52 +0200 Subject: [PATCH 15/19] spec : fix vocab compatibility check (#24256) (cherry picked from commit 8a091c47abe67e0a03b85bc7c9eee8bdb9b14b05) --- common/speculative.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/speculative.cpp b/common/speculative.cpp index aa0d0ed86b24..628ded45ca7a 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -58,10 +58,10 @@ static bool common_speculative_are_compatible( const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * 
vocab_dft = llama_model_get_vocab(model_dft); - const bool vocab_type_tgt = llama_vocab_type(vocab_tgt); + const auto vocab_type_tgt = llama_vocab_type(vocab_tgt); LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); - const bool vocab_type_dft = llama_vocab_type(vocab_dft); + const auto vocab_type_dft = llama_vocab_type(vocab_dft); LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); if (vocab_type_tgt != vocab_type_dft) { From d1e70aaf024d469aa3a2f443880335fb565d5e19 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sun, 7 Jun 2026 20:50:54 +0800 Subject: [PATCH 16/19] llama : add Gemma4 MTP (#23398) (cherry picked from commit 04eb4c446d22b63449d5dc41c038987d4d8cc3a6) --- common/speculative.cpp | 87 ++++++----- conversion/__init__.py | 2 + conversion/gemma.py | 10 ++ gguf-py/gguf/constants.py | 24 +++ gguf-py/gguf/tensor_mapping.py | 8 + include/llama.h | 4 + src/llama-arch.cpp | 5 + src/llama-arch.h | 3 + src/llama-context.cpp | 55 ++++--- src/llama-context.h | 3 +- src/llama-cparams.h | 2 + src/llama-ext.h | 2 + src/llama-graph.cpp | 31 ++-- src/llama-graph.h | 1 + src/llama-hparams.cpp | 4 + src/llama-hparams.h | 9 ++ src/llama-kv-cache-dsa.cpp | 261 +++++++++++++++++++++++++++++++ src/llama-kv-cache-iswa.cpp | 18 ++- src/llama-kv-cache-iswa.h | 4 +- src/llama-kv-cache.cpp | 203 +++++++++++++++++------- src/llama-kv-cache.h | 8 +- src/llama-memory-hybrid-iswa.cpp | 2 + src/llama-memory-hybrid.cpp | 2 + src/llama-memory.h | 4 + src/llama-model.cpp | 93 ++++++++--- src/llama-model.h | 5 + src/models/gemma4-assistant.cpp | 200 +++++++++++++++++++++++ src/models/gemma4.cpp | 22 ++- src/models/models.h | 13 ++ tests/test-llama-archs.cpp | 6 +- tools/server/server-context.cpp | 32 ++-- 31 files changed, 951 insertions(+), 172 deletions(-) create mode 100644 src/llama-kv-cache-dsa.cpp create mode 100644 src/models/gemma4-assistant.cpp diff --git a/common/speculative.cpp b/common/speculative.cpp index 628ded45ca7a..86c1e6a42903 100644 --- 
a/common/speculative.cpp +++ b/common/speculative.cpp @@ -3,13 +3,14 @@ #include "common.h" #include "ggml.h" #include "llama.h" -#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP) #include "log.h" #include "ngram-cache.h" #include "ngram-map.h" #include "ngram-mod.h" #include "sampling.h" +#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP) + #include #include #include @@ -418,6 +419,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { int32_t n_embd = 0; + bool is_mem_shared = false; + // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1. // The last h-row of one process() call needs the first token of the NEXT // call to pair with, so it's stashed here until that next call fires. @@ -444,7 +447,9 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; GGML_ASSERT(ctx_tgt && ctx_dft && "MTP requires ctx_tgt and ctx_dft to be set"); - n_embd = llama_model_n_embd(llama_get_model(ctx_dft)); + n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft)); + GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) && + "MTP input row width must match the target h_nextn width"); LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling); @@ -490,6 +495,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false); llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); + is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt; + pending_h.assign(n_seq, std::vector(n_embd, 0.0f)); i_batch_beg.assign(n_seq, -1); @@ -526,9 +533,11 @@ 
struct common_speculative_impl_draft_mtp : public common_speculative_impl { if (N <= 0) { return; } + auto * ctx_dft = this->params.ctx_dft; const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); - if (pos_max < N - 1) { + + if (pos_max < N - 1 && !is_mem_shared) { LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - " "process() hook may not have run on every prefill ubatch " "(need_embd / logits=1 on every prompt position?). " @@ -571,48 +580,42 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { const size_t row_bytes = (size_t) n_embd * sizeof(float); - common_batch_clear(batch); + // if kv is shared with target (e.g Gemma4), then we can skip this catch-up decode + if (!is_mem_shared) { + common_batch_clear(batch); - for (int k = 0; k < n_tokens; ++k) { - common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0); - } + for (int k = 0; k < n_tokens; ++k) { + common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0); + } - // shift the tgt embeddings to the right by one position - // assumes that the tokens in the batch are sequential for each sequence - // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1] - // ^--- this is a problem - // TODO:this is generally true, but would be nice to assert it - { - const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt); - std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1)); + // shift the tgt embeddings to the right by one position + // assumes that the tokens in the batch are sequential for each sequence + // i.e. 
we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1] + // ^--- this is a problem + // TODO:this is generally true, but would be nice to assert it + { + const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt); + std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1)); + } - //{ - // // string with seq_ids in the batch - // std::stringstream ss; - // for (int i = 0; i < n_tokens; ++i) { - // ss << batch_in.seq_id[i][0] << ","; - // } - // LOG_WRN("%s: batch_in.seq_id = %s\n", __func__, ss.str().c_str()); - //} - } + // fill the pending embeddings from a previous run + auto set_h = [&](int idx, const float * h_row) { + std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes); + }; - // fill the pending embeddings from a previous run - auto set_h = [&](int idx, const float * h_row) { - std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes); - }; + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_batch_beg[seq_id] < 0) { + continue; + } - for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { - if (i_batch_beg[seq_id] < 0) { - continue; + set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); } - set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); - } - - const int32_t rc = llama_decode(ctx_dft, batch); - if (rc != 0) { - LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); - return false; + const int32_t rc = llama_decode(ctx_dft, batch); + if (rc != 0) { + LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); + return false; + } } for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { @@ -721,7 +724,13 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { continue; } - common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); + if (is_mem_shared) { + // note: with shared memory (e.g. 
Gemma4 assistants) we use the same position for all draft tokens + // ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37 + common_batch_add(batch, id, dp.n_past, { seq_id }, true); + } else { + common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); + } std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes); } diff --git a/conversion/__init__.py b/conversion/__init__.py index e933280a9422..14b86198631b 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -73,9 +73,11 @@ "Gemma3TextModel": "gemma", "Gemma3nForCausalLM": "gemma", "Gemma3nForConditionalGeneration": "gemma", + "Gemma4AssistantForCausalLM": "gemma", "Gemma4ForConditionalGeneration": "gemma", "Gemma4ForCausalLM": "gemma", "Gemma4UnifiedForConditionalGeneration": "gemma", + "Gemma4UnifiedAssistantForCausalLM": "gemma", "GemmaForCausalLM": "gemma", "Glm4ForCausalLM": "glm", "Glm4MoeForCausalLM": "glm", diff --git a/conversion/gemma.py b/conversion/gemma.py index 31a450ed4824..5339ec016780 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -785,6 +785,16 @@ def set_gguf_parameters(self): self.gguf_writer.add_suppress_tokens(suppress_tokens) +@ModelBase.register("Gemma4AssistantForCausalLM", "Gemma4UnifiedAssistantForCausalLM") +class Gemma4AssistantModel(Gemma4Model): + model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"]) + self.gguf_writer.add_nextn_predict_layers(self.block_count) + + @ModelBase.register("Gemma4ForConditionalGeneration") class Gemma4VisionAudioModel(MmprojModel): has_audio_encoder = True diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 67e81ab5f836..5bb723f6416a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -431,6 +431,7 @@ class 
MODEL_ARCH(IntEnum): GEMMA3 = auto() GEMMA3N = auto() GEMMA4 = auto() + GEMMA4_ASSISTANT = auto() GEMMA_EMBEDDING = auto() STARCODER2 = auto() RWKV6 = auto() @@ -858,6 +859,8 @@ class MODEL_TENSOR(IntEnum): A_PER_DIM_K_SCALE = auto() # gemma4 A_PER_DIM_SCALE = auto() # gemma4 # nextn/mtp + NEXTN_PROJ_PRE = auto() + NEXTN_PROJ_POST = auto() NEXTN_EH_PROJ = auto() NEXTN_EMBED_TOKENS = auto() NEXTN_ENORM = auto() @@ -947,6 +950,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GEMMA3: "gemma3", MODEL_ARCH.GEMMA3N: "gemma3n", MODEL_ARCH.GEMMA4: "gemma4", + MODEL_ARCH.GEMMA4_ASSISTANT: "gemma4-assistant", MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", @@ -1404,6 +1408,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down", MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm", # NextN/MTP + MODEL_TENSOR.NEXTN_PROJ_PRE: "nextn.pre_projection", + MODEL_TENSOR.NEXTN_PROJ_POST: "nextn.post_projection", MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj", MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens", MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm", @@ -2485,6 +2491,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.PER_LAYER_PROJ_NORM, MODEL_TENSOR.PER_LAYER_POST_NORM, ], + MODEL_ARCH.GEMMA4_ASSISTANT: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.NEXTN_PROJ_PRE, + MODEL_TENSOR.NEXTN_PROJ_POST, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + MODEL_TENSOR.LAYER_OUT_SCALE, + ], MODEL_ARCH.GEMMA_EMBEDDING: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ab9dfe36758f..85c4541486fc 100644 --- 
a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -2251,6 +2251,14 @@ class TensorNameMap: ), # NextN/MTP tensors + MODEL_TENSOR.NEXTN_PROJ_PRE: ( + "pre_projection", + ), + + MODEL_TENSOR.NEXTN_PROJ_POST: ( + "post_projection", + ), + MODEL_TENSOR.NEXTN_EH_PROJ: ( "model.layers.{bid}.eh_proj", ), diff --git a/include/llama.h b/include/llama.h index 52eb212466b5..43d72d07fbd8 100644 --- a/include/llama.h +++ b/include/llama.h @@ -389,6 +389,10 @@ extern "C" { // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; + + // a source/target/parent context + // can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts + struct llama_context * ctx_other; }; struct llama_model_tensor_override { diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 200b099a388c..dfbadd7c7a87 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -57,6 +57,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA3, "gemma3" }, { LLM_ARCH_GEMMA3N, "gemma3n" }, { LLM_ARCH_GEMMA4, "gemma4" }, + { LLM_ARCH_GEMMA4_ASSISTANT, "gemma4-assistant" }, { LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, @@ -447,6 +448,8 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" }, { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, + { LLM_TENSOR_NEXTN_PROJ_PRE, "nextn.pre_projection" }, + { LLM_TENSOR_NEXTN_PROJ_POST, "nextn.post_projection" }, { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" }, { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" }, { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" }, @@ -759,6 +762,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, 
GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_PROJ_PRE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_PROJ_POST, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so // the model loader doesn't fault on the block index. diff --git a/src/llama-arch.h b/src/llama-arch.h index dfb89eb8292a..975b6e39b805 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -61,6 +61,7 @@ enum llm_arch { LLM_ARCH_GEMMA3, LLM_ARCH_GEMMA3N, LLM_ARCH_GEMMA4, + LLM_ARCH_GEMMA4_ASSISTANT, LLM_ARCH_GEMMA_EMBEDDING, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, @@ -551,6 +552,8 @@ enum llm_tensor { LLM_TENSOR_INDEXER_PROJ, LLM_TENSOR_INDEXER_ATTN_K, LLM_TENSOR_INDEXER_ATTN_Q_B, + LLM_TENSOR_NEXTN_PROJ_PRE, + LLM_TENSOR_NEXTN_PROJ_POST, LLM_TENSOR_NEXTN_EH_PROJ, LLM_TENSOR_NEXTN_EMBED_TOKENS, LLM_TENSOR_NEXTN_ENORM, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a41d2499ba7b..7a895f52226c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -69,9 +69,10 @@ llama_context::llama_context( cparams.embeddings_nextn_masked = false; cparams.offload_kqv = params.offload_kqv; cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; cparams.warmup = false; + cparams.ctx_type = params.ctx_type; + cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? 
hparams.rope_freq_base_train : params.rope_freq_base; @@ -84,7 +85,17 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; - cparams.ctx_type = params.ctx_type; + cparams.ctx_other = nullptr; + + // TODO: more generic + if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { + if (params.ctx_other == nullptr) { + // TODO: change from runtime_error to llama_exception to avoid printing error message + throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)"); + } + + cparams.ctx_other = params.ctx_other; + } // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later @@ -298,10 +309,11 @@ llama_context::llama_context( // init the memory module if (!hparams.vocab_only) { llama_memory_params params_mem = { - /*.type_k =*/ params.type_k, - /*.type_v =*/ params.type_v, - /*.swa_full =*/ params.swa_full, - /*.ctx_type= */ cparams.ctx_type, + /*.type_k =*/ params.type_k, + /*.type_v =*/ params.type_v, + /*.swa_full =*/ params.swa_full, + /*.ctx_type =*/ cparams.ctx_type, + /*.mem_other =*/ llama_get_memory(cparams.ctx_other), }; memory.reset(model.create_memory(params_mem, cparams)); @@ -898,7 +910,7 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) { throw std::runtime_error("no nextn embeddings"); } - const uint32_t n_embd = model.hparams.n_embd; + const uint32_t n_embd = model.hparams.n_embd_out(); if (!cparams.embeddings_nextn_masked) { // unmasked: nextn rows are stored densely, indexed by raw token position. 
@@ -1467,7 +1479,7 @@ int llama_context::encode(const llama_batch & batch_inp) { ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_nextn); GGML_ASSERT(backend_h != nullptr); - const uint32_t n_embd = hparams.n_embd; + const uint32_t n_embd = hparams.n_embd_out(); GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_nextn.size); ggml_backend_tensor_get_async(backend_h, t_h_nextn, embd_nextn.data, 0, n_tokens*n_embd*sizeof(float)); } @@ -1918,7 +1930,7 @@ int llama_context::decode(const llama_batch & batch_inp) { ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_nextn); GGML_ASSERT(backend_h != nullptr); - const uint32_t n_embd = hparams.n_embd; + const uint32_t n_embd = hparams.n_embd_out(); float * embd_nextn_out = embd_nextn.data + offset*n_embd; GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_nextn.size); @@ -2011,7 +2023,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; const auto n_embd_out = hparams.n_embd_out(); bool has_logits = true; @@ -2030,12 +2041,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { logits.size = has_logits ? n_vocab*n_outputs_max : 0; embd.size = has_embd ? n_embd_out*n_outputs_max : 0; - embd_nextn.size = has_embd_nextn ? n_embd*n_outputs_max : 0; + embd_nextn.size = has_embd_nextn ? n_embd_out*n_outputs_max : 0; if (has_embd_nextn && !cparams.embeddings_nextn_masked) { // unmasked: nextn row exists for every token in the batch, not just // those flagged via batch.logits[i] -> size by token count instead. - embd_nextn.size = (size_t) n_embd * n_batch; + embd_nextn.size = (size_t) n_embd_out * n_batch; } // Allocate backend sampling output buffers if there are backend samplers configured. 
@@ -3368,6 +3379,7 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, + /*.ctx_other =*/ nullptr, }; return result; @@ -3476,7 +3488,6 @@ llama_context * llama_init_from_model( return nullptr; } - try { auto * ctx = new llama_context(*model, params); return ctx; @@ -3615,6 +3626,14 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) { ctx->set_embeddings_nextn(value, masked); } +llama_memory_t llama_get_memory(const struct llama_context * ctx) { + if (!ctx) { + return nullptr; + } + + return ctx->get_memory(); +} + float * llama_get_embeddings_nextn(llama_context * ctx) { ctx->synchronize(); @@ -3678,7 +3697,7 @@ struct ggml_cgraph * llama_graph_reserve( uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs) { - auto * memory = ctx->get_memory(); + auto memory = ctx->get_memory(); llama_memory_context_ptr mctx; if (memory) { mctx = memory->init_full(); @@ -3718,10 +3737,6 @@ int32_t llama_set_adapter_cvec( // memory // -llama_memory_t llama_get_memory(const struct llama_context * ctx) { - return ctx->get_memory(); -} - void llama_memory_clear(llama_memory_t mem, bool data) { if (!mem) { return; @@ -4032,3 +4047,7 @@ void llama_opt_epoch( llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx) { return ctx->memory_breakdown(); } + +llama_context * llama_get_ctx_other(struct llama_context * ctx) { + return ctx->get_cparams().ctx_other; +} diff --git a/src/llama-context.h b/src/llama-context.h index 2af92b0f0968..6f8f59a22a3e 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -6,6 +6,7 @@ #include "llama-graph.h" #include "llama-adapter.h" #include "llama-impl.h" +#include "llama-memory.h" #include "ggml-cpp.h" #include "ggml-opt.h" @@ -273,7 +274,7 @@ struct llama_context { llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably - std::unique_ptr memory; + llama_memory_ptr memory; // 
decode output (2-dimensional array: [n_outputs][n_vocab]) buffer_view logits = {nullptr, 0}; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index a1515e9a061d..11060e863162 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -48,4 +48,6 @@ struct llama_cparams { ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + + llama_context * ctx_other; }; diff --git a/src/llama-ext.h b/src/llama-ext.h index 7ad6125fad33..bd74544129b4 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -100,3 +100,5 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); + +LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index ef712be139e9..2bc43eece94a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -392,7 +392,7 @@ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64 case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break; }; - LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str); + LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swa_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str); LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__); LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__); @@ -537,18 +537,18 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { if (self_k_idxs && self_k_idxs->buffer) { mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); - - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); } + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + // swa 
tensors may not be allocated if there are no SWA attention layers if (self_k_idxs_swa && self_k_idxs_swa->buffer) { mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); - - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); } + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + if (self_k_rot) { mctx->get_base()->set_input_k_rot(self_k_rot); } @@ -577,18 +577,18 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { if (self_k_idxs && self_k_idxs->buffer) { res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + // swa tensors may not be allocated if there are no SWA attention layers if (self_k_idxs_swa && self_k_idxs_swa->buffer) { res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + return res; } @@ -721,7 +721,9 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch); + } + if (inp_attn->self_kq_mask && inp_attn->self_kq_mask->buffer) { attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); } @@ -729,7 +731,9 @@ void 
llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch); attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch); + } + if (inp_attn->self_kq_mask_swa && inp_attn->self_kq_mask_swa->buffer) { attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); } @@ -775,18 +779,18 @@ bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens; //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(inp_attn->self_kq_mask, attn_ctx->get_base(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(inp_attn->self_kq_mask, attn_ctx->get_base(), params.ubatch, params.cparams); + // swa tensors may not be allocated if there are no SWA attention layers if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; //res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams); + res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; @@ -971,6 +975,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : ubatch (params.ubatch), n_embd (hparams.n_embd), n_layer (hparams.n_layer()), + n_layer_nextn (hparams.n_layer_nextn), n_rot (hparams.n_rot()), n_ctx (cparams.n_ctx), n_head 
(hparams.n_head()), diff --git a/src/llama-graph.h b/src/llama-graph.h index b514e4c23e07..06921868644b 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -744,6 +744,7 @@ struct llm_graph_context { const int64_t n_embd; const int64_t n_layer; + const int64_t n_layer_nextn; const int64_t n_rot; const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_head; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index e1e49d1cc1fe..2bf576873824 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -91,6 +91,10 @@ uint32_t llama_hparams::n_rot(uint32_t il) const { } uint32_t llama_hparams::n_embd_inp() const { + if (n_embd_inp_impl > 0) { + return n_embd_inp_impl; + } + uint32_t n_embd_inp = n_embd; if (n_deepstack_layers > 0) { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 094526eca82b..60f4025e88df 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -182,6 +182,9 @@ struct llama_hparams { // for Classifiers uint32_t n_cls_out = 1; + // input embedding dimension (0 = use n_embd) + uint32_t n_embd_inp_impl = 0; + // output embedding dimension (0 = use n_embd) uint32_t n_embd_out_impl = 0; @@ -216,6 +219,12 @@ struct llama_hparams { uint32_t indexer_top_k = 0; // qwen3vl deepstack + // When parsed from GGUF, this implies the first N layers consume the first + // N deepstack embeddings. Use deepstack_mapping_arr if you need a more + // complex mapping. If using deepstack_mapping_arr, also make sure to set + // n_deepstack_layers to the number of unique deepstack layers so that + // n_embd_inp() is accurate (see granite.cpp).
+ // TODO: can be expressed via the new `n_embd_inp_impl` and remove this param uint32_t n_deepstack_layers = 0; // gemma4 per-layer embedding diff --git a/src/llama-kv-cache-dsa.cpp b/src/llama-kv-cache-dsa.cpp new file mode 100644 index 000000000000..916ab6537563 --- /dev/null +++ b/src/llama-kv-cache-dsa.cpp @@ -0,0 +1,261 @@ +#include "llama-kv-cache-dsa.h" + +#include "llama-impl.h" +#include "llama-batch.h" +#include "llama-model.h" + +#include +#include + +// +// llama_kv_cache_dsa +// + +llama_kv_cache_dsa::llama_kv_cache_dsa( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool v_trans, + bool offload, + bool unified, + uint32_t kv_size, + uint32_t n_seq_max, + uint32_t n_pad, + uint32_t n_swa, + llama_swa_type swa_type, + const layer_filter_cb & filter, + const layer_reuse_cb & reuse) : + hparams_lid(model.hparams), n_stream(unified ? 1 : n_seq_max) { + + LLAMA_LOG_INFO("%s: creating main KV cache, size = %u cells\n", __func__, kv_size); + + kv_mla = std::make_unique( + model, model.hparams, type_k, type_v, + v_trans, offload, unified, kv_size, n_seq_max, n_pad, + n_swa, swa_type, nullptr, filter, reuse, nullptr); + + // we use llama_kv_cache for caching indexer keys + // by hand-tweaking some hparams we fool it to create + // indexer key cache tensors with correct dimensions + // https://github.com/ggml-org/llama.cpp/pull/21149#discussion_r3015940823 + + // DSA lightning indexer uses MQA with single key head + std::fill(hparams_lid.n_head_kv_arr.begin(), hparams_lid.n_head_kv_arr.end(), 1); + hparams_lid.n_embd_head_k_full = model.hparams.indexer_head_size; + hparams_lid.rope_type = LLAMA_ROPE_TYPE_NEOX; + + LLAMA_LOG_INFO("%s: creating indexer KV cache, size = %u cells\n", __func__, kv_size); + + kv_lid = std::make_unique( + model, hparams_lid, type_k, type_v, + v_trans, offload, unified, kv_size, n_seq_max, n_pad, + n_swa, swa_type, nullptr, filter, reuse, nullptr); +} + +void llama_kv_cache_dsa::clear(bool data) { +
kv_mla->clear(data); + kv_lid->clear(data); +} + +bool llama_kv_cache_dsa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + bool res = true; + + res = res & kv_mla->seq_rm(seq_id, p0, p1); + res = res & kv_lid->seq_rm(seq_id, p0, p1); + + return res; +} + +void llama_kv_cache_dsa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + kv_mla->seq_cp(seq_id_src, seq_id_dst, p0, p1); + kv_lid->seq_cp(seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_cache_dsa::seq_keep(llama_seq_id seq_id) { + kv_mla->seq_keep(seq_id); + kv_lid->seq_keep(seq_id); +} + +void llama_kv_cache_dsa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + kv_mla->seq_add(seq_id, p0, p1, shift); + kv_lid->seq_add(seq_id, p0, p1, shift); +} + +void llama_kv_cache_dsa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + kv_mla->seq_div(seq_id, p0, p1, d); + kv_lid->seq_div(seq_id, p0, p1, d); +} + +llama_pos llama_kv_cache_dsa::seq_pos_min(llama_seq_id seq_id) const { + return kv_mla->seq_pos_min(seq_id); +} + +llama_pos llama_kv_cache_dsa::seq_pos_max(llama_seq_id seq_id) const { + return kv_mla->seq_pos_max(seq_id); +} + +std::map llama_kv_cache_dsa::memory_breakdown() const { + std::map mb = kv_mla->memory_breakdown(); + for (const auto & buft_size : kv_lid->memory_breakdown()) { + mb[buft_size.first] += buft_size.second; + } + return mb; +} + +llama_memory_context_ptr llama_kv_cache_dsa::init_batch( + llama_batch_allocr & balloc, + uint32_t n_ubatch, + bool embd_all) { + GGML_UNUSED(embd_all); + + do { + balloc.split_reset(); + + std::vector ubatches; + while (true) { + auto ubatch = n_stream == 1 ? 
balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true); + + if (ubatch.n_tokens == 0) { + break; + } + + ubatches.push_back(std::move(ubatch)); // NOLINT + } + + if (balloc.get_n_used() < balloc.get_n_tokens()) { + // failed to find a suitable split + break; + } + + auto sinfos_mla = kv_mla->prepare(ubatches); + if (sinfos_mla.empty()) { + break; + } + + auto sinfos_lid = kv_lid->prepare(ubatches); + if (sinfos_lid.empty()) { + break; + } + + assert(sinfos_mla.size() == sinfos_lid.size()); + + return std::make_unique( + this, std::move(sinfos_mla), std::move(sinfos_lid), std::move(ubatches)); + } while (false); + + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); +} + +llama_memory_context_ptr llama_kv_cache_dsa::init_full() { + return std::make_unique(this); +} + +llama_memory_context_ptr llama_kv_cache_dsa::init_update(llama_context * lctx, bool optimize) { + return std::make_unique(this, lctx, optimize); +} + +bool llama_kv_cache_dsa::get_can_shift() const { + return kv_mla->get_can_shift() && + kv_lid->get_can_shift() && + kv_mla->get_size() == kv_lid->get_size(); +} + +void llama_kv_cache_dsa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { + kv_mla->state_write(io, seq_id, flags); + kv_lid->state_write(io, seq_id, flags); +} + +void llama_kv_cache_dsa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { + kv_mla->state_read(io, seq_id, flags); + kv_lid->state_read(io, seq_id, flags); +} + +llama_kv_cache * llama_kv_cache_dsa::get_mla() const { + return kv_mla.get(); +} + +llama_kv_cache * llama_kv_cache_dsa::get_lid() const { + return kv_lid.get(); +} + +// +// llama_kv_cache_dsa_context +// + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(llama_memory_status status) : status(status) {} + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv) : + ctx_mla(kv->get_mla()->init_full()), + ctx_lid(kv->get_lid()->init_full()), 
+ status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { +} + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv, + llama_context * lctx, + bool optimize) : + ctx_mla(kv->get_mla()->init_update(lctx, optimize)), + ctx_lid(kv->get_lid()->init_update(lctx, optimize)), + status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { +} + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv, + slot_info_vec_t sinfos_mla, + slot_info_vec_t sinfos_lid, + std::vector ubatches) : + ubatches(std::move(ubatches)), + // note: here we copy the ubatches. not sure if this is ideal + ctx_mla(new llama_kv_cache_context(kv->get_mla(), std::move(sinfos_mla), this->ubatches)), + ctx_lid(new llama_kv_cache_context(kv->get_lid(), std::move(sinfos_lid), this->ubatches)), + status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { +} + +llama_kv_cache_dsa_context:: ~llama_kv_cache_dsa_context() = default; + +bool llama_kv_cache_dsa_context::next() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + ctx_mla->next(); + ctx_lid->next(); + + if (++i_next >= ubatches.size()) { + return false; + } + + return true; +} + +bool llama_kv_cache_dsa_context::apply() { + assert(!llama_memory_status_is_fail(status)); + + bool res = true; + + res = res & ctx_mla->apply(); + res = res & ctx_lid->apply(); + + return res; +} + +llama_memory_status llama_kv_cache_dsa_context::get_status() const { + return status; +} + +const llama_ubatch & llama_kv_cache_dsa_context::get_ubatch() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return ubatches[i_next]; +} + +const llama_kv_cache_context * llama_kv_cache_dsa_context::get_mla() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return static_cast(ctx_mla.get()); +} + +const llama_kv_cache_context * llama_kv_cache_dsa_context::get_lid() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + 
return static_cast(ctx_lid.get()); +} diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index 26e2cb4270b0..11eedc312ad8 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -23,8 +23,10 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad, + llama_memory_t mem_other, const layer_filter_cb & filter, - const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) { + const layer_reuse_cb & reuse, + const layer_share_cb & share) : hparams(model.hparams), unified(unified) { // chain filters const layer_filter_cb filter_base = [&](int32_t il) { @@ -59,17 +61,27 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); + llama_memory_t mem_other_base = nullptr; + if (mem_other) { + mem_other_base = static_cast(mem_other)->get_base(); + } + + llama_memory_t mem_other_swa = nullptr; + if (mem_other) { + mem_other_swa = static_cast(mem_other)->get_swa(); + } + kv_base = std::make_unique( model, type_k, type_v, v_trans, offload, unified, size_base, n_seq_max, n_pad, - 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse); + 0, LLAMA_SWA_TYPE_NONE, mem_other_base, filter_base, reuse, share); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( model, type_k, type_v, v_trans, offload, unified, size_swa, n_seq_max, n_pad, - hparams.n_swa, hparams.swa_type, filter_swa, reuse); + hparams.n_swa, hparams.swa_type, mem_other_swa, filter_swa, reuse, share); } void llama_kv_cache_iswa::clear(bool data) { diff --git a/src/llama-kv-cache-iswa.h b/src/llama-kv-cache-iswa.h index 70ab22f0d608..dfafc1ef510b 100644 --- a/src/llama-kv-cache-iswa.h +++ b/src/llama-kv-cache-iswa.h @@ -25,8 +25,10 @@ class llama_kv_cache_iswa : public llama_memory_i { uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad, + llama_memory_t mem_other, const layer_filter_cb & filter, - 
const layer_reuse_cb & reuse); + const layer_reuse_cb & reuse, + const layer_share_cb & share); ~llama_kv_cache_iswa() = default; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 11fd5547b387..2bf5ff125f5a 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -103,6 +103,7 @@ TURBO_IQ_IMPORT void turbo_innerq_mark_tensor_updated(void); llama_kv_cache::llama_kv_cache( const llama_model & model, + const llama_hparams & hparams, ggml_type type_k, ggml_type type_v, bool v_trans, @@ -113,9 +114,11 @@ llama_kv_cache::llama_kv_cache( uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type, + llama_memory_t mem_other, const layer_filter_cb & filter, - const layer_reuse_cb & reuse) : - model(model), hparams(model.hparams), v_trans(v_trans), + const layer_reuse_cb & reuse, + const layer_share_cb & share) : + model(model), hparams(hparams), v_trans(v_trans), n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { GGML_ASSERT(kv_size % n_pad == 0); @@ -211,6 +214,8 @@ llama_kv_cache::llama_kv_cache( const bool is_mla = hparams.is_mla(); + other = static_cast(mem_other); + for (uint32_t il = 0; il < n_layer; il++) { if (!hparams.has_kv(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il); @@ -222,6 +227,24 @@ llama_kv_cache::llama_kv_cache( continue; } + if (share && other) { + const int32_t il_share = share(il); + + if (il_share >= 0) { + const auto & layer_share = other->layers[other->map_layer_ids[il_share]]; + + LLAMA_LOG_WARN("%s: layer %3d: sharing with layer %d. 
k = %p, v = %p\n", __func__, il, il_share, + layer_share.k->data, layer_share.v->data); + + map_layer_ids[il] = layers.size(); + + layers.push_back(layer_share); + layers.back().il = il; + + continue; + } + } + if (n_embd_head_k_all == 0) { n_embd_head_k_all = (int32_t) hparams.n_embd_head_k(il); } else if (n_embd_head_k_all > 0 && n_embd_head_k_all != (int32_t) hparams.n_embd_head_k(il)) { @@ -464,61 +487,68 @@ llama_kv_cache::llama_kv_cache( ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - // TurboQuant: master's #21038 attention rotation is OFF by default on this - // fork. Enable per-side via LLAMA_ATTN_ROT_K_OVERRIDE=1 and/or - // LLAMA_ATTN_ROT_V_OVERRIDE=1 if your specific model+KV combo benefits. - // - // Why default OFF: empirical PPL+KLD testing on 7 model families - // (gemma-4 26B-A4B/31B/E2B, Qwen2.5-7B, Qwen3.5-2B, Mistral-Small-24B, - // phi-4, on q8/turbo4 KV) showed the optimal rotation policy is highly - // model-and-quant specific: - // - // • gemma-4 31B Q8 q8/turbo4: V-only rotation gives -43% PPL (huge win). - // • gemma-4 26B-A4B Q8 q8/turbo4: V-only gives -3.9%. - // • gemma-4 E2B Q4_K_L q8/turbo4: V-only HURTS by +6.7%. - // • phi-4 Q8 q8/turbo4: V-side rotation crashes (graph hash overflow). - // • Qwen2.5/3.5/Mistral: rotation effect is within standard error. - // - // No single default is correct everywhere, including within the same - // architecture family (gemma-4 above shows three distinct optima across - // three sizes). Per-arch heuristics in code would silently regress users - // on variants we haven't tested. Default OFF + per-side env knobs lets - // each user tune for their specific config; documented findings in the - // README guide the choice. - // - // Reported by @erazortt (TheTom/turboquant_plus#88). - // - // LLAMA_ATTN_ROT_DISABLE retained as a no-op alias (default OFF makes it - // redundant but historical scripts may set it). 
- // Default attn_rot_disable=false now that rotation is OFF by default. The - // env var is preserved as a hard lock-out (=1 forces rotation off and - // blocks overrides), useful for users who want to guarantee no rotation - // regardless of any LLAMA_ATTN_ROT_*_OVERRIDE settings. - const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE"); - const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? (atoi(LLAMA_ATTN_ROT_DISABLE) != 0) : false; - - // Default: rotation OFF on both sides (safe across all tested model families). - // Override per side via env vars below. - attn_rot_k = false; - attn_rot_v = false; - - // Per-side overrides. Set LLAMA_ATTN_ROT_K_OVERRIDE=1 / LLAMA_ATTN_ROT_V_OVERRIDE=1 - // to enable rotation. The cache type and head-dim alignment guards below - // still apply: rotation only takes effect on quantized types with - // head_dim % 64 == 0 (master's #21038 requirements). - const char * ROT_K_OV = getenv("LLAMA_ATTN_ROT_K_OVERRIDE"); - if (ROT_K_OV && atoi(ROT_K_OV) != 0 && !attn_rot_disable) { - attn_rot_k = - n_embd_head_k_all > 0 && - ggml_is_quantized(type_k) && - hparams.n_embd_head_k() % 64 == 0; - } - const char * ROT_V_OV = getenv("LLAMA_ATTN_ROT_V_OVERRIDE"); - if (ROT_V_OV && atoi(ROT_V_OV) != 0 && !attn_rot_disable) { - attn_rot_v = - n_embd_head_v_all > 0 && - ggml_is_quantized(type_v) && - hparams.n_embd_head_v() % 64 == 0; + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + // KV-cache sharing (MTP draft): a shared cache inherits head dims and the + // resolved rotation policy from its parent so draft and target agree. + if (other) { + n_embd_head_k_all = other->n_embd_head_k_all; + n_embd_head_v_all = other->n_embd_head_v_all; + + attn_rot_k = other->attn_rot_k; + attn_rot_v = other->attn_rot_v; + } else { + // TurboQuant: master's #21038 attention rotation is OFF by default on this + // fork. 
Enable per-side via LLAMA_ATTN_ROT_K_OVERRIDE=1 and/or + // LLAMA_ATTN_ROT_V_OVERRIDE=1 if your specific model+KV combo benefits. + // + // Why default OFF: empirical PPL+KLD testing on 7 model families + // (gemma-4 26B-A4B/31B/E2B, Qwen2.5-7B, Qwen3.5-2B, Mistral-Small-24B, + // phi-4, on q8/turbo4 KV) showed the optimal rotation policy is highly + // model-and-quant specific: + // + // • gemma-4 31B Q8 q8/turbo4: V-only rotation gives -43% PPL (huge win). + // • gemma-4 26B-A4B Q8 q8/turbo4: V-only gives -3.9%. + // • gemma-4 E2B Q4_K_L q8/turbo4: V-only HURTS by +6.7%. + // • phi-4 Q8 q8/turbo4: V-side rotation crashes (graph hash overflow). + // • Qwen2.5/3.5/Mistral: rotation effect is within standard error. + // + // No single default is correct everywhere, including within the same + // architecture family (gemma-4 above shows three distinct optima across + // three sizes). Per-arch heuristics in code would silently regress users + // on variants we haven't tested. Default OFF + per-side env knobs lets + // each user tune for their specific config; documented findings in the + // README guide the choice. + // + // Reported by @erazortt (TheTom/turboquant_plus#88). + // + // LLAMA_ATTN_ROT_DISABLE retained as a hard lock-out: =1 forces rotation + // off on both sides and blocks the per-side overrides below. + const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE"); + const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? (atoi(LLAMA_ATTN_ROT_DISABLE) != 0) : false; + + // Default: rotation OFF on both sides (safe across all tested model families). + // Override per side via env vars below. + attn_rot_k = false; + attn_rot_v = false; + + // Per-side overrides. Set LLAMA_ATTN_ROT_K_OVERRIDE=1 / LLAMA_ATTN_ROT_V_OVERRIDE=1 + // to enable rotation. The cache type and head-dim alignment guards below + // still apply: rotation only takes effect on quantized types with + // head_dim % 64 == 0 (master's #21038 requirements). 
+ const char * ROT_K_OV = getenv("LLAMA_ATTN_ROT_K_OVERRIDE"); + if (ROT_K_OV && atoi(ROT_K_OV) != 0 && !attn_rot_disable) { + attn_rot_k = + n_embd_head_k_all > 0 && + ggml_is_quantized(type_k) && + hparams.n_embd_head_k() % 64 == 0; + } + const char * ROT_V_OV = getenv("LLAMA_ATTN_ROT_V_OVERRIDE"); + if (ROT_V_OV && atoi(ROT_V_OV) != 0 && !attn_rot_disable) { + attn_rot_v = + n_embd_head_v_all > 0 && + ggml_is_quantized(type_v) && + hparams.n_embd_head_v() % 64 == 0; + } } LLAMA_LOG_INFO("%s: attn_rot_k = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_k, n_embd_head_k_all); @@ -577,6 +607,11 @@ void llama_kv_cache::clear(bool data) { } bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return true; + } + GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); if (p0 < 0) { @@ -640,6 +675,11 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { } void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size()); GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size()); @@ -727,6 +767,11 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll } void llama_kv_cache::seq_keep(llama_seq_id seq_id) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -749,6 +794,11 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) { } void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < 
seq_to_stream.size()); GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_add() is only supported for n_pos_per_embd() == 1"); @@ -794,6 +844,11 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll } void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_div() is only supported for n_pos_per_embd() == 1"); @@ -828,6 +883,11 @@ void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, in } llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return other->seq_pos_min(seq_id); + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); const auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -836,6 +896,11 @@ llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const { } llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return other->seq_pos_max(seq_id); + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); const auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -976,6 +1041,11 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vectorget_sched(); @@ -1251,6 +1321,12 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, } void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + v_cells = other->v_cells; + return; + } + // keep track of the max sequence position that we would overwrite with this ubatch // for non-SWA cache, this would be always empty llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ]; @@ -2095,6 +2171,9 @@ void 
llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { } ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + GGML_ASSERT(!other); + auto * ctx = res->get_ctx(); auto * gf = res->get_gf(); @@ -2140,6 +2219,11 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co } void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_UNUSED(flags); io.write(&n_stream, sizeof(n_stream)); @@ -2193,6 +2277,11 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla } void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_UNUSED(flags); GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index d19d9945de40..aa4d466a60e1 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -95,6 +95,7 @@ class llama_kv_cache : public llama_memory_i { llama_kv_cache( const llama_model & model, + const llama_hparams & hparams, ggml_type type_k, ggml_type type_v, bool v_trans, @@ -105,8 +106,10 @@ class llama_kv_cache : public llama_memory_i { uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type, + llama_memory_t mem_other, const layer_filter_cb & filter, - const layer_reuse_cb & reuse); + const layer_reuse_cb & reuse, + const layer_share_cb & share); ~llama_kv_cache() = default; @@ -269,6 +272,9 @@ class llama_kv_cache : public llama_memory_i { // note: this is not part of the KV state and it's only used to speed-up the find_slot() method std::vector v_heads; + // TODO: temporary until we refactor to be able to share the same cells between 2 kv caches 
[TAG_KV_CACHE_SHARE_CELLS] + llama_kv_cache * other; + std::vector v_cells; // maps from a sequence id to a stream id diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp index a242079b4066..c7d4bcd413e8 100644 --- a/src/llama-memory-hybrid-iswa.cpp +++ b/src/llama-memory-hybrid-iswa.cpp @@ -43,9 +43,11 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa( n_seq_max, n_ubatch, n_pad, + nullptr, filter_attn == nullptr ? [&](int32_t il) { return !hparams.is_recr(il); } : filter_attn, + nullptr, nullptr )), mem_recr(new llama_memory_recurrent( diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index c14c4cd9a797..dda457d9ebe2 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -43,9 +43,11 @@ llama_memory_hybrid::llama_memory_hybrid( n_pad, n_swa, swa_type, + nullptr, filter_attn == nullptr ? [&](int32_t il) { return !hparams.is_recr(il); } : filter_attn, + nullptr, nullptr )), mem_recr(new llama_memory_recurrent( diff --git a/src/llama-memory.h b/src/llama-memory.h index 1ab412261f38..6bd5ff6339a7 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -23,6 +23,8 @@ struct llama_memory_params { bool swa_full; llama_context_type ctx_type; + + llama_memory_t mem_other; }; enum llama_memory_status { @@ -85,6 +87,8 @@ struct llama_memory_i { // return negative value to indicate that the layer il should not reuse memory using layer_reuse_cb = std::function; + using layer_share_cb = std::function; + virtual ~llama_memory_i() = default; // split the input batch into a set of ubatches and verify that they can fit into the cache diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0ffb7cdc4787..3c96e31c63e2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -134,6 +134,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_gemma3n(params); case LLM_ARCH_GEMMA4: return new llama_model_gemma4(params); + case 
LLM_ARCH_GEMMA4_ASSISTANT: + return new llama_model_gemma4_assistant(params); case LLM_ARCH_GEMMA_EMBEDDING: return new llama_model_gemma_embedding(params); case LLM_ARCH_STARCODER2: @@ -1671,19 +1673,21 @@ void llama_model::print_info() const { if (!hparams.vocab_only) { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); - LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp()); + LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_embd_out = %u\n", __func__, hparams.n_embd_out()); LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer()); - LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer()).c_str()); - LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_layer_all = %u\n", __func__, hparams.n_layer_all); + LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full); LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any()); LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full); LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full); - LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer()).c_str()); - LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer()).c_str()); - LLAMA_LOG_INFO("%s: n_embd_v_gqa = 
%s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); @@ -1691,7 +1695,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale); LLAMA_LOG_INFO("%s: f_attn_value_scale = %.4f\n", __func__, hparams.f_attn_value_scale); - LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer()).c_str()); + LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups); @@ -1712,6 +1716,14 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? 
"yes" : "unknown"); + if (arch == LLM_ARCH_GRANITE && + std::any_of(hparams.deepstack_mapping_arr.begin(), + hparams.deepstack_mapping_arr.end(), + [](const auto & entry) { return entry >= 0; })) { + LLAMA_LOG_INFO("%s: deepstack_mapping_arr = %s\n", __func__, + print_f([&](uint32_t il) { return hparams.deepstack_mapping_arr[il]; }, + hparams.n_layer_all).c_str()); + } // MRoPE (Multi-axis Rotary Position Embedding) sections if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]); @@ -2038,8 +2050,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* filter_recr */ std::move(filter_recr)); } } else { - llama_memory_i::layer_reuse_cb reuse = nullptr; llama_kv_cache::layer_filter_cb filter = nullptr; + llama_memory_i::layer_reuse_cb reuse = nullptr; + llama_kv_cache::layer_share_cb share = nullptr; if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { reuse = [&](uint32_t il) { @@ -2068,20 +2081,53 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(hparams.is_swa_any()); - res = new llama_kv_cache_iswa( - *this, - params.type_k, - params.type_v, - !cparams.flash_attn, - cparams.offload_kqv, - params.swa_full, - cparams.kv_unified, - cparams.n_ctx_seq, - cparams.n_seq_max, - cparams.n_ubatch, - 1, - filter, - reuse); + if (arch == LLM_ARCH_GEMMA4_ASSISTANT) { + llama_memory_t mem_other = llama_get_memory(cparams.ctx_other); + + share = [&](int32_t il) { + const llama_model * model_other = llama_get_model(cparams.ctx_other); + + if (hparams.is_swa(il)) { + return llama_model_n_layer(model_other) - 2; + } + + return llama_model_n_layer(model_other) - 1; + }; + + res = new llama_kv_cache_iswa( + *this, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + params.swa_full, + cparams.kv_unified, + 
cparams.n_ctx_seq, + cparams.n_seq_max, + cparams.n_ubatch, + 1, + mem_other, + filter, + reuse, + share); + } else { + res = new llama_kv_cache_iswa( + *this, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + params.swa_full, + cparams.kv_unified, + cparams.n_ctx_seq, + cparams.n_seq_max, + cparams.n_ubatch, + 1, + nullptr, + filter, + reuse, + share); + } } else { GGML_ASSERT(!hparams.is_swa_any()); @@ -2097,7 +2143,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, 1, hparams.n_swa, hparams.swa_type, + nullptr, filter, + nullptr, nullptr); } } @@ -2329,6 +2377,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GEMMA3: case LLM_ARCH_GEMMA3N: case LLM_ARCH_GEMMA4: + case LLM_ARCH_GEMMA4_ASSISTANT: case LLM_ARCH_GEMMA_EMBEDDING: case LLM_ARCH_STARCODER2: case LLM_ARCH_OPENELM: diff --git a/src/llama-model.h b/src/llama-model.h index 3a8866d30a21..2d2b83208540 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -542,6 +542,10 @@ struct llama_model { struct ggml_tensor * output_s = nullptr; struct ggml_tensor * output_in_s = nullptr; + // NextN/MTP model-level projections + struct ggml_tensor * nextn_proj_pre = nullptr; + struct ggml_tensor * nextn_proj_post = nullptr; + // classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b = nullptr; @@ -696,6 +700,7 @@ const char * llm_type_name(llm_type type); #define LLAMA_LOAD_LOCALS \ const int n_layer = hparams.n_layer(); GGML_UNUSED(n_layer); \ const int n_layer_all = hparams.n_layer_all; GGML_UNUSED(n_layer_all); \ + const int n_layer_nextn = hparams.n_layer_nextn; GGML_UNUSED(n_layer_nextn); \ const int64_t n_head = hparams.n_head(); GGML_UNUSED(n_head); \ const int64_t n_head_kv = hparams.n_head_kv(); GGML_UNUSED(n_head_kv); \ const int64_t n_embd = hparams.n_embd; GGML_UNUSED(n_embd); \ diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp new file mode 100644 index 
000000000000..5b7a25a5abaf --- /dev/null +++ b/src/models/gemma4-assistant.cpp @@ -0,0 +1,200 @@ +#include "models.h" + +void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) { + hparams.n_embd_inp_impl = hparams.n_embd_out(); + + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); + + uint32_t n_kv_shared_layers = 0; + ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); + + hparams.f_attention_scale = 1.0f; + + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn == hparams.n_layer_all && "n_layer_nextn must be == n_layer_impl"); + + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); +} + +void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + if (n_embd_head_k != n_embd_head_v) { + throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k == n_embd_head_v"); + } + if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) { + throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k_swa == n_embd_head_v_swa"); + } + if (hparams.n_embd_out() == n_embd) { + throw std::runtime_error("Gemma 4 assistant requires embedding_length_out to carry the target hidden size"); + } + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + + const int64_t n_embd_backbone = hparams.n_embd_inp(); + 
nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0); + + int rope_freqs_flag = 0; + + for (int i = 0; i < n_layer_nextn; ++i) { + auto & layer = layers[i]; + + const int64_t n_head = hparams.n_head(i); + const int64_t n_embd_head = hparams.n_embd_head_k(i); + const int64_t n_ff = hparams.n_ff(i); + + if (i == 0) { + nextn_proj_pre = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_PRE, "weight", i), { 2*n_embd_backbone, n_embd }, 0); + } + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head*n_head }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head*n_head, n_embd }, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + + layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), { 1u }, 0); + + if (!hparams.is_swa(i)) { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_embd_head/2 }, rope_freqs_flag); + rope_freqs_flag = TENSOR_DUPLICATED; + } + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), { n_embd }, 0); + } +} + +std::unique_ptr llama_model_gemma4_assistant::build_arch_graph(const llm_graph_params & params) const { + return std::make_unique(*this, params); +} + +llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_graph_params & params) : + 
llm_graph_context(params) { + const int64_t n_embd_backbone = hparams.n_embd_inp(); + + ggml_tensor * inp_tokens; + ggml_tensor * inp_h; + { + auto inp = std::make_unique(n_embd_backbone); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); + inp_tokens = inp->tokens; + res->t_inp_tokens = inp->tokens; + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_backbone, ubatch.n_tokens); + cb(inp->embd, "inp_h", -1); + ggml_set_input(inp->embd); + inp_h = inp->embd; + res->t_inp_embd = inp->embd; + + res->add_input(std::move(inp)); + } + + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + + ggml_tensor * x = ggml_get_rows(ctx0, model_other->tok_embd, inp_tokens); + x = ggml_scale(ctx0, x, sqrtf((float) n_embd_backbone)); + cb(x, "inp_embd_target", -1); + + ggml_tensor * xh = ggml_concat(ctx0, x, inp_h, 0); + cb(xh, "inp_xh", -1); + + ggml_tensor * cur = ggml_mul_mat(ctx0, model.nextn_proj_pre, xh); + cb(cur, "pre_proj", -1); + + auto * inp_attn = build_attn_inp_kv_iswa(); + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + ggml_tensor * inpL = cur; + + for (int il = 0; il < n_layer_nextn; ++il) { + const bool is_swa = hparams.is_swa(il); + + const int64_t n_embd_head = hparams.n_embd_head_k(il); + const int64_t n_head = hparams.n_head(il); + + const float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + const int n_rot_l = hparams.n_rot(il); + + ggml_tensor * cur_norm = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur_norm, "attn_norm", il); + + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur_norm); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); 
+ cb(Qcur, "Qcur_normed", il); + + ggml_tensor * freq_factors = is_swa ? nullptr : model.layers[il].rope_freqs; + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, + freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur_pos", il); + + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, nullptr, + Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); + + if (il == n_layer_nextn - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + cur = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL); + cb(attn_out, "attn_out", il); + + cur = build_norm(attn_out, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, + LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = build_norm(cur, model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, attn_out); + + cur = ggml_mul(ctx0, cur, model.layers[il].out_scale); + cb(cur, "out_scaled", il); + + inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + ggml_tensor * logits = build_lora_mm(model.output, cur); + cb(logits, "result_output", -1); + res->t_logits = logits; + + ggml_tensor * h_next = ggml_mul_mat(ctx0, model.nextn_proj_post, cur); + cb(h_next, "h_nextn", -1); + res->t_h_nextn = h_next; + + ggml_build_forward_expand(gf, logits); + ggml_build_forward_expand(gf, h_next); +} diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index 7198e5411164..6f7fcd645cbd 100644 --- 
a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -155,12 +155,14 @@ class llm_graph_input_logits_bias : public llm_graph_input_i { } virtual ~llm_graph_input_logits_bias() = default; - void set_input(const llama_ubatch *) override { + void set_input(const llama_ubatch * /*ubatch*/) override { const int64_t n_vocab = arr.size(); ggml_backend_tensor_set(logits_bias, arr.data(), 0, n_vocab*ggml_element_size(logits_bias)); } - // bool can_reuse(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & /*params*/) override { + return true; + } ggml_tensor * logits_bias = nullptr; // F32 [n_vocab] @@ -270,7 +272,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para } // TODO @ngxson : strip unused token right after the last KV layer to speed up prompt processing - if (il == n_layer - 1 && inp_out_ids) { + // keep all rows when extracting unmasked nextn embeddings (MTP target needs the hidden state for every token) + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } @@ -370,7 +373,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para ggml_tensor * inp_this_layer = ggml_view_2d_slice(ctx0, inp_per_layer, il); // [n_embd_per_layer, n_tokens] // TODO @ngxson : improve this - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { inp_this_layer = ggml_get_rows(ctx0, inp_this_layer, inp_out_ids); } @@ -401,6 +404,17 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para model.output_norm, nullptr, LLM_NORM_RMS, -1); + // Expose the post-output-norm hidden state (the LM-head input feature) so that + // MTP draft contexts can read it via llama_get_embeddings_nextn_ith() as the + // recurrent h input. 
This matches the reference (transformers/vLLM/SGLang), + // which feeds the drafter the target's post-final-norm hidden state. + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + if (!cparams.embeddings_nextn_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + cb(cur, "result_norm", -1); res->t_embd = cur; diff --git a/src/models/models.h b/src/models/models.h index 7e551eb965b1..bc20a82bc0e2 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -797,6 +797,19 @@ struct llama_model_gemma4 : public llama_model_base { }; +struct llama_model_gemma4_assistant : public llama_model_base { + llama_model_gemma4_assistant(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_gemma_embedding : public llama_model_base { llama_model_gemma_embedding(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 16af11a28623..d1138da0f6f5 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -387,7 +387,7 @@ static bool arch_supported(const llm_arch arch) { if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { return false; // FIXME CUDA backend crashes. 
} - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { return false; // FIXME @ngxson } if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) { @@ -442,7 +442,7 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; } - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } for (bool moe : {false, true}) { @@ -526,7 +526,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; } - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index e48efd7663a7..b7154fff6612 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1,4 +1,3 @@ - #include "server-context.h" #include "server-chat.h" #include "server-common.h" @@ -15,6 +14,11 @@ #include "mtmd.h" #include "mtmd-helper.h" +#include "ggml-cpp.h" + +// TODO: tmp until the mtmd draft processing is refactored [TAG_MTMD_DRAFT_PROCESSING] +#include "../../src/llama-ext.h" + #include #include #include @@ -874,7 +878,7 @@ struct server_context_impl { has_draft ? "draft model" : "MTP context", total / (1024.0 * 1024.0)); } catch (const std::exception & e) { - SRV_ERR("[spec] failed to measure %s memory: %s\n", + SRV_WRN("[spec] failed to measure %s memory: %s\n", has_draft ? 
"draft model" : "MTP context", e.what()); } } @@ -930,16 +934,17 @@ struct server_context_impl { const bool spec_mtp = std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); + if (spec_mtp) { cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP; } // note: for small models maybe we can set this to the maximum possible draft from all speculative types // the extra memory for small models is likely negligible? - cparams.n_rs_seq = 0; - ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); + cparams.n_rs_seq = 0; + cparams.ctx_other = ctx_tgt; - ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); + ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); @@ -949,8 +954,12 @@ struct server_context_impl { params_base.model.path.c_str()); auto cparams_mtp = common_context_params_to_llama(params_base); - cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP; - cparams_mtp.n_rs_seq = 0; + cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP; + cparams_mtp.type_k = params_base.speculative.draft.cache_type_k; + cparams_mtp.type_v = params_base.speculative.draft.cache_type_v; + cparams_mtp.n_rs_seq = 0; + cparams_mtp.n_outputs_max = params_base.n_parallel; + cparams_mtp.ctx_other = ctx_tgt; ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); if (ctx_dft == nullptr) { @@ -958,8 +967,6 @@ struct server_context_impl { return false; } - ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); - params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); } @@ -1057,6 +1064,10 @@ struct server_context_impl { } } + if (ctx_dft) { + ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); + } + if (spec) { SRV_INF("%s", "speculative decoding context initialized\n"); } else { @@ -2993,10 +3004,11 @@ struct 
server_context_impl { continue; } - if (ctx_dft) { + if (ctx_dft && llama_get_ctx_other(ctx_dft.get()) != ctx_tgt) { // TODO: in the future, figure out how to infuse target embeddings to the images // for now, we skip this for simplicity // maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above? + // [TAG_MTMD_DRAFT_PROCESSING] res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); if (res != 0) { GGML_ABORT("failed to process multi-modal data on draft context\n"); From 2f756e67e3e59f06120e863bffc47f31adc77aa8 Mon Sep 17 00:00:00 2001 From: TheTom Date: Mon, 8 Jun 2026 12:56:28 -0500 Subject: [PATCH 17/19] fork: reconcile MTP lineage with TurboQuant+ KV cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integration glue so the upstream MTP lineage (#23198..#23398) builds on this fork without disturbing TurboQuant+ or the custom kernels: - llama_kv_cache ctor: thread the new `hparams` param and `layer_share_cb` through all call sites (iswa, memory-hybrid, dsa, model.cpp); keep the fork's turbo auto-asymmetric K upgrade, n_layer_kv() sizing (+3 rotation tensors), and per-side LLAMA_ATTN_ROT_* policy (default OFF) — now nested under the new `if (other) { share } else { ... }` KV-sharing branch. - hparams: carry n_layer_all/n_layer_nextn + n_layer()/n_layer_kv() from the refactor while keeping the fork's n_layer_kv_from_start; restore the swa_layers->is_swa_impl / recurrent_layer_arr->is_recr_impl / nextn_predict_layers->n_layer_nextn renames across fork models. - add n_outputs_max to cparams / common_params / llama_context_params and wire it through; restore deepstack_mapping_arr. - server: keep the #23398 ctx_other (MTP draft KV-sharing) wiring; drop the #23988 --fit VRAM pre-estimation block (depends on upstream helpers not on this fork; MTP does not need it). 
- drop upstream-only models pulled in by the refactor (deepseek32, mellum, talkie); keep non-MTP fork models on their own source + mechanical refactor. Builds clean on Metal; turbo quant unit test passes (turbo2/3/4 round-trip). Kernels (ggml-cuda / ggml-metal) untouched. --- common/common.cpp | 1 + common/common.h | 1 + include/llama.h | 1 + src/llama-context.cpp | 2 + src/llama-cparams.h | 1 + src/llama-hparams.cpp | 10 ++ src/llama-hparams.h | 6 + src/llama-kv-cache-iswa.cpp | 4 +- src/llama-kv-cache.cpp | 4 +- src/llama-memory-hybrid.cpp | 1 + src/llama-model.cpp | 3 +- src/models/step35.cpp | 289 +------------------------------- tools/server/server-context.cpp | 122 +------------- 13 files changed, 34 insertions(+), 411 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6cf2d5b48f79..3b058def7db5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1539,6 +1539,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.n_ctx = params.n_ctx; cparams.n_seq_max = params.n_parallel; + cparams.n_outputs_max = params.n_outputs_max; cparams.n_rs_seq = params.speculative.need_n_rs_seq(); cparams.n_batch = params.n_batch; cparams.n_ubatch = params.n_ubatch; diff --git a/common/common.h b/common/common.h index 85c562286c6e..9718863ecc55 100644 --- a/common/common.h +++ b/common/common.h @@ -430,6 +430,7 @@ struct common_params { int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_outputs_max = 0; // max outputs supported by the context (0 = derive) int32_t n_sequences = 1; // number of sequences to decode int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width diff --git a/include/llama.h b/include/llama.h index 43d72d07fbd8..6f6cd5f1f5c0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -340,6 
+340,7 @@ extern "C" { uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode uint32_t n_ubatch; // physical maximum batch size uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) + uint32_t n_outputs_max; // max outputs supported by the context (0 = derive from n_seq_max) uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7a895f52226c..43c2b722f76e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -47,6 +47,7 @@ llama_context::llama_context( const auto & hparams = model.hparams; cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_outputs_max = params.n_outputs_max; if (cparams.n_seq_max > LLAMA_MAX_SEQ) { throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ)); } @@ -3349,6 +3350,7 @@ llama_context_params llama_context_default_params() { /*.n_batch =*/ 2048, /*.n_ubatch =*/ 512, /*.n_seq_max =*/ 1, + /*.n_outputs_max =*/ 0, /*.n_rs_seq =*/ 0, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 11060e863162..e1b3cb9cfa4b 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -13,6 +13,7 @@ struct llama_cparams { uint32_t n_ubatch; uint32_t n_seq_max; uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback + uint32_t n_outputs_max; // max outputs supported by the context int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 2bf576873824..08f41f60f4f0 100644 --- a/src/llama-hparams.cpp +++ 
b/src/llama-hparams.cpp @@ -269,6 +269,16 @@ uint32_t llama_hparams::n_layer() const { return n_layer_all - n_layer_nextn; } +uint32_t llama_hparams::n_layer_kv() const { + uint32_t res = 0; + for (uint32_t il = 0; il < n_layer(); ++il) { + if (has_kv(il)) { + res++; + } + } + return res; +} + bool llama_hparams::use_mrope() const { return rope_sections[0] > 0 && rope_sections[1] > 0; } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 60f4025e88df..1bb99dcf3738 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -227,6 +227,9 @@ struct llama_hparams { // TODO: can be expressed via the `new n_embd_inp_impl` and remove this param uint32_t n_deepstack_layers = 0; + // deepstack layer array (Granite4 Vision): -1 => none, >=0 => input embedding index + std::array deepstack_mapping_arr; + // gemma4 per-layer embedding uint32_t n_embd_per_layer = 0; @@ -329,6 +332,9 @@ struct llama_hparams { // number of effective layers (excludes nextn layers) uint32_t n_layer() const; + // number of layers that carry a KV cache (respects n_layer_kv_from_start) + uint32_t n_layer_kv() const; + // note that this function uses different SWA parameters from those in the hparams // note: inlined on purpose for performance reasons // TODO: think of a better place for this function diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index 11eedc312ad8..07d7dedb71fe 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -72,14 +72,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( } kv_base = std::make_unique( - model, type_k, type_v, + model, model.hparams, type_k, type_v, v_trans, offload, unified, size_base, n_seq_max, n_pad, 0, LLAMA_SWA_TYPE_NONE, mem_other_base, filter_base, reuse, share); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( - model, type_k, type_v, + model, model.hparams, type_k, type_v, v_trans, offload, unified, size_swa, n_seq_max, n_pad, 
hparams.n_swa, hparams.swa_type, mem_other_swa, filter_swa, reuse, share); } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 2bf5ff125f5a..c3c5bcfe6df0 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -310,7 +310,7 @@ llama_kv_cache::llama_kv_cache( return mode; } // Auto-enable Boundary V (mode 7) when V is turbo2 - if (type_v == GGML_TYPE_TURBO2_0 && hparams.n_layer >= 8) { + if (type_v == GGML_TYPE_TURBO2_0 && hparams.n_layer() >= 8) { LLAMA_LOG_INFO("llama_kv_cache: Boundary V auto-enabled for turbo2-V (opt-out: TURBO_LAYER_ADAPTIVE=0)\n"); return 7; } @@ -318,7 +318,7 @@ llama_kv_cache::llama_kv_cache( }(); const bool is_turbo = (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0 || type_k == GGML_TYPE_TURBO2_0); const bool v_is_turbo = (type_v == GGML_TYPE_TURBO3_0 || type_v == GGML_TYPE_TURBO4_0 || type_v == GGML_TYPE_TURBO2_0); - const uint32_t n_layer = hparams.n_layer; + const uint32_t n_layer = hparams.n_layer(); if (adaptive_mode == 1 && is_turbo && n_layer >= 8) { if (il < 4 || il >= n_layer - 4) { layer_type_k = GGML_TYPE_Q8_0; diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index dda457d9ebe2..fc0444cb7caa 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -33,6 +33,7 @@ llama_memory_hybrid::llama_memory_hybrid( hparams(model.hparams), mem_attn(new llama_kv_cache( model, + model.hparams, type_k, type_v, v_trans, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 3c96e31c63e2..f519784ccafb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -547,7 +547,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str return {tensor->ne[axis]}; }; - auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector> & segments) -> std::vector { + auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector & segments) -> std::vector { if (hparams.is_recr(il)) { // linear 
attention const int64_t head_dim = hparams.ssm_d_state; @@ -2133,6 +2133,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, res = new llama_kv_cache( *this, + hparams, params.type_k, params.type_v, !cparams.flash_attn, diff --git a/src/models/step35.cpp b/src/models/step35.cpp index 753bd68a4429..7f52fa61d1fd 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -22,16 +22,10 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer(), false); ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer(), false); - // NextN/MTP (Step3p5): extra decoder block appended beyond the main stack. - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); - GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer()) { case 45: type = LLM_TYPE_196B_A11B; break; default: type = LLM_TYPE_UNKNOWN; @@ -41,15 +35,6 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { void llama_model_step35::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; - const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); - // Trunk-only: the GGUF declares MTP layers in metadata but the actual MTP - // tensors live in a separate file (e.g. user split target/draft). Mark - // MTP tensors NOT_REQUIRED so the trunk loads cleanly. - const std::string mtp_probe = "blk." 
+ std::to_string(n_layer) + ".nextn.eh_proj.weight"; - const bool trunk_only = (hparams.n_layer_nextn > 0) && (ml.get_weight(mtp_probe.c_str()) == nullptr); - const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; - const int mtp_flags = trunk_only ? TENSOR_NOT_REQUIRED : 0; - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output @@ -110,79 +95,6 @@ void llama_model_step35::load_arch_tensors(llama_model_loader &) { layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED); - }; - - auto load_block_mtp = [&](int i, bool is_first_mtp) { - auto & layer = layers[i]; - - const uint32_t n_head_l = hparams.n_head(i); - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); - - // The MTP block is a full Step3p5 decoder layer (mtp_block) plus the - // NextN-specific wiring (enorm/hnorm/eh_proj + optional shared head). - // `mtp_flags` becomes NOT_REQUIRED when the GGUF is trunk-only. - // - // Only the FIRST MTP block (i == n_main) is required for the - // single-block MTP runtime; trailing MTP blocks are always tolerated - // as missing so pruned GGUFs (block 0 only) load cleanly. Override - // mtp_flags to NOT_REQUIRED for those. - const int eff_mtp_flags = is_first_mtp ? 
mtp_flags : (mtp_flags | TENSOR_NOT_REQUIRED); - - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, eff_mtp_flags); - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); - - if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { - layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); - layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); - } else { - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); - } - - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, eff_mtp_flags); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, eff_mtp_flags); - - layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED); - - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, eff_mtp_flags); - - // dense MLP (leading dense blocks) — present if the MTP block isn't MoE - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, TENSOR_NOT_REQUIRED); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); - - // MoE routed experts + selection bias (router_bias) - const int64_t n_ff_exp = hparams.n_ff_exp; - layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, TENSOR_NOT_REQUIRED); - layer.ffn_gate_exps = 
create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, TENSOR_NOT_REQUIRED); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, TENSOR_NOT_REQUIRED); - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED); - - layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); - layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); - layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED); - - // NextN-specific tensors that define the MTP block. - layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, eff_mtp_flags); - layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, eff_mtp_flags); - layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, eff_mtp_flags); - layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); - layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); - }; - - for (int i = 0; i < n_layer; ++i) { - load_block_trunk(i, trunk_flags); - } - // Only the first MTP block (i == n_main) is required at runtime — the - // single-block-MTP graph in build_arch_graph always uses that one. 
- // Trailing MTP blocks are loaded if present (so an un-pruned GGUF with - // all MTP layers still works) but tolerated when absent via the pruning - // path. See scripts/prune_step35_extra_mtp.py for the pruner. - for (int i = n_layer; i < n_layer_all; ++i) { - load_block_mtp(i, /*is_first_mtp=*/ i == n_layer); } } @@ -199,7 +111,6 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para auto * inp_attn = build_attn_inp_kv_iswa(); ggml_tensor * inp_out_ids = build_inp_out_ids(); - // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -287,8 +198,8 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para cb(cur, "attn_proj", il); } - if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -346,13 +257,6 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para cur = inpL; - cb(cur, "h_nextn", -1); - res->t_h_nextn = cur; - - if (!cparams.embeddings_nextn_masked && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - } - cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; @@ -363,192 +267,3 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para ggml_build_forward_expand(gf, cur); } - -// LLM_GRAPH_TYPE_DECODER_MTP draft head for Step3p5 (MoE) -llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) - : llm_graph_context(params) { - GGML_ASSERT(hparams.n_layer_nextn > 0 && "STEP35 MTP requires n_layer_nextn > 0"); - - // Single-block MTP only: always run the first trained MTP block (Qwen - // MTP / vLLM single-MTP-layer style). 
Multi-block round-robin proved to - // be a much deeper refactor than this PR justifies; the trailing MTP - // blocks are loaded with TENSOR_NOT_REQUIRED so pruned GGUFs (with just - // block 0) also work — see load_arch_tensors below and - // scripts/prune_step35_extra_mtp.py. - const int il = hparams.n_layer(); - const auto & layer = model.layers[il]; - - GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); - GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); - GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); - - const uint32_t n_head_l = hparams.n_head(il); - const uint32_t n_head_kv_l = hparams.n_head_kv(il); - - const float freq_base_l = model.get_rope_freq_base(cparams, il); - const float freq_scale_l = model.get_rope_freq_scale(cparams, il); - - auto inp = std::make_unique(hparams.n_embd); - - inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_input(inp->tokens); - - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); - ggml_set_input(inp->embd); - ggml_set_name(inp->embd, "mtp_h_input"); - - ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? 
layer.nextn.embed_tokens : model.tok_embd; - - ggml_tensor * h_input = inp->embd; - ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); - cb(tok_embd, "mtp_tok_embd", il); - - res->add_input(std::move(inp)); - - ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_iswa(); - - ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); - cb(h_norm, "mtp_hnorm", il); - - ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); - cb(e_norm, "mtp_enorm", il); - - ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); - cb(concat, "mtp_concat", il); - - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); - cb(cur, "mtp_eh_proj", il); - - ggml_tensor * inpSA = cur; - - // mtp_block: full Step3p5 decoder layer (attention with optional head-wise gate, then MoE/dense FFN) - cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "mtp_attn_norm", il); - - ggml_tensor * Qcur = build_lora_mm(layer.wq, cur, layer.wq_s); - ggml_tensor * Kcur = build_lora_mm(layer.wk, cur, layer.wk_s); - ggml_tensor * Vcur = build_lora_mm(layer.wv, cur, layer.wv_s); - cb(Qcur, "mtp_Qcur", il); - cb(Kcur, "mtp_Kcur", il); - cb(Vcur, "mtp_Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens); - - if (layer.attn_q_norm) { - Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il); - cb(Qcur, "mtp_Qcur_normed", il); - } - if (layer.attn_k_norm) { - Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il); - cb(Kcur, "mtp_Kcur_normed", il); - } - - const bool is_swa = hparams.is_swa(il); - ggml_tensor * rope_factors = is_swa ? 
nullptr : model.get_rope_factors(cparams, il); - const int64_t n_rot_l = hparams.n_rot(il); - - Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, - n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, - ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, - n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, - ext_factor, attn_factor, beta_fast, beta_slow); - cb(Qcur, "mtp_Qcur_pos", il); - cb(Kcur, "mtp_Kcur_pos", il); - - const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k)); - ggml_tensor * attn_out = build_attn(inp_attn, - nullptr, nullptr, nullptr, - Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); - cb(attn_out, "mtp_attn_out", il); - - // head-wise attention gate: sigmoid(g_proj(x)) - if (layer.wqkv_gate) { - ggml_tensor * gate = build_lora_mm(layer.wqkv_gate, cur); // [n_head_l, n_tokens] - cb(gate, "mtp_attn_gate", il); - - gate = ggml_sigmoid(ctx0, gate); - cb(gate, "mtp_attn_gate_sigmoid", il); - - ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens); - ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate, 1, n_head_l, n_tokens); - cb(gate_3d, "mtp_attn_gate_3d", il); - - attn_3d = ggml_mul(ctx0, attn_3d, gate_3d); - cb(attn_3d, "mtp_attn_gated_3d", il); - - attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens); - cb(attn_out, "mtp_attn_gated", il); - } - - cur = build_lora_mm(layer.wo, attn_out, layer.wo_s); - cb(cur, "mtp_attn_proj", il); - - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "mtp_attn_residual", il); - - ggml_tensor * ffn_inp = cur; - cur = build_norm(cur, layer.ffn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "mtp_ffn_norm", il); - - // FFN: dense MLP or MoE (mirrors trunk path) - if (layer.ffn_gate_inp == nullptr) { - cur = build_ffn(cur, - layer.ffn_up, layer.ffn_up_b, nullptr, - layer.ffn_gate, layer.ffn_gate_b, nullptr, - layer.ffn_down, layer.ffn_down_b, nullptr, - nullptr, - 
LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "mtp_ffn_out", il); - } else { - ggml_tensor * moe_out = build_moe_ffn(cur, - layer.ffn_gate_inp, - layer.ffn_up_exps, - layer.ffn_gate_exps, - layer.ffn_down_exps, - layer.ffn_exp_probs_b, - n_expert, n_expert_used, - LLM_FFN_SILU, hparams.expert_weights_norm, - hparams.expert_weights_scale, - (llama_expert_gating_func_type) hparams.expert_gating_func, - il); - cb(moe_out, "mtp_ffn_moe_out", il); - - ggml_tensor * sh_out = build_ffn(cur, - layer.ffn_up_shexp, nullptr, nullptr, - layer.ffn_gate_shexp, nullptr, nullptr, - layer.ffn_down_shexp, nullptr, nullptr, - nullptr, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(sh_out, "mtp_ffn_shared_out", il); - - cur = ggml_add(ctx0, moe_out, sh_out); - cb(cur, "mtp_ffn_out", il); - } - cur = ggml_add(ctx0, cur, ffn_inp); - cb(cur, "mtp_post_ffn", il); - - // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. - cb(cur, "h_nextn", -1); - res->t_h_nextn = cur; - - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm - ? layer.nextn.shared_head_norm - : model.output_norm; - GGML_ASSERT(head_norm_w && "STEP35 MTP: missing both nextn.shared_head_norm and output_norm"); - cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); - cb(cur, "mtp_shared_head_norm", -1); - - ggml_tensor * head_w = layer.nextn.shared_head_head ? 
layer.nextn.shared_head_head : model.output; - GGML_ASSERT(head_w && "STEP35 MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); - cb(cur, "result_output", -1); - - res->t_logits = cur; - ggml_build_forward_expand(gf, cur); -} diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index b7154fff6612..cbb79547c012 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -764,125 +764,9 @@ struct server_context_impl { params_base = params; params_base.n_outputs_max = server_n_outputs_max(params_base); - std::string & mmproj_path = params_base.mmproj.path; - bool has_mmproj = !mmproj_path.empty(); - mtmd_context_params mparams = mtmd_context_params_default(); - if (has_mmproj) { - mparams.use_gpu = params_base.mmproj_use_gpu; - mparams.print_timings = false; - mparams.n_threads = params_base.cpuparams.n_threads; - mparams.flash_attn_type = params_base.flash_attn_type; - mparams.warmup = params_base.warmup; - mparams.image_min_tokens = params_base.image_min_tokens; - mparams.image_max_tokens = params_base.image_max_tokens; - mparams.media_marker = get_media_marker(); - } - - // optionally get the memory usage of mmproj - if (has_mmproj && params_base.fit_params) { - auto mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams); - if (!mmproj_mem.empty()) { - size_t total = 0; - for (auto & [dev, size] : mmproj_mem) { - total += size; - } - SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB\n", total / (1024.0 * 1024.0)); - GGML_ASSERT(!params_base.fit_params_target.empty()); - for (auto & [dev, size] : mmproj_mem) { - for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - if (ggml_backend_dev_get(i) == dev) { - if (i < params_base.fit_params_target.size()) { - SRV_DBG("[mtmd] adding %.2f MiB to fit_params_target for device %s\n", size / (1024.0 * 1024.0), ggml_backend_dev_name(dev)); - params_base.fit_params_target[i] += size; - } - 
break; - } - } - } - } else { - SRV_ERR("%s", "[mtmd] failed to get memory usage of mmproj\n"); - } - } - - // optionally reserve VRAM for the draft / MTP context before fitting the target model - if (params_base.fit_params) { - const bool spec_mtp = std::find(params_base.speculative.types.begin(), - params_base.speculative.types.end(), - COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); - const bool has_draft = params_base.speculative.has_dft(); - - if (has_draft || spec_mtp) { - common_params params_dft = params_base; - bool measure_model_bytes = true; - - if (has_draft) { - const auto & params_spec = params_base.speculative.draft; - params_dft.devices = params_spec.devices; - params_dft.model = params_spec.mparams; - params_dft.n_gpu_layers = params_spec.n_gpu_layers; - params_dft.cache_type_k = params_spec.cache_type_k; - params_dft.cache_type_v = params_spec.cache_type_v; - params_dft.tensor_buft_overrides = params_spec.tensor_buft_overrides; - } else { - // MTP draft context lives on the target model, only context+compute are new - measure_model_bytes = false; - } - - params_dft.n_outputs_max = params_base.n_parallel; - - auto mparams_dft = common_model_params_to_llama(params_dft); - auto cparams_dft = common_context_params_to_llama(params_dft); - if (spec_mtp) { - cparams_dft.ctx_type = LLAMA_CONTEXT_TYPE_MTP; - cparams_dft.type_k = params_base.speculative.draft.cache_type_k; - cparams_dft.type_v = params_base.speculative.draft.cache_type_v; - } - cparams_dft.n_rs_seq = 0; - - std::vector devs; - uint32_t hp_ngl = 0; - uint32_t hp_nct = 0; - uint32_t hp_nex = 0; - try { - auto dmd = common_get_device_memory_data( - params_dft.model.path.c_str(), &mparams_dft, &cparams_dft, - devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR); - - GGML_ASSERT(!params_base.fit_params_target.empty()); - size_t total = 0; - - std::vector tgt_devices = params.devices; - - if (tgt_devices.empty()) { - for(size_t i = 0; i < ggml_backend_dev_count(); ++i) { 
- tgt_devices.push_back(ggml_backend_dev_get(i)); - } - } - - for (size_t j = 0; j < devs.size(); ++j) { - const size_t bytes = - (measure_model_bytes ? dmd[j].mb.model : 0) + - dmd[j].mb.context + - dmd[j].mb.compute; - total += bytes; - for (size_t i = 0; i < tgt_devices.size(); i++) { - if (tgt_devices[i] == devs[j]) { - SRV_DBG("[spec] adding %.2f MiB to fit_params_target for device %s\n", - bytes / (1024.0 * 1024.0), ggml_backend_dev_name(devs[j])); - params_base.fit_params_target[i] += bytes; - break; - } - } - } - SRV_INF("[spec] estimated memory usage of %s is %.2f MiB\n", - has_draft ? "draft model" : "MTP context", - total / (1024.0 * 1024.0)); - } catch (const std::exception & e) { - SRV_WRN("[spec] failed to measure %s memory: %s\n", - has_draft ? "draft model" : "MTP context", e.what()); - } - } - } + // note: upstream's --fit VRAM pre-estimation for the mmproj / draft-MTP + // context (mtmd_get_memory_usage + common_get_device_memory_data) is not + // carried on this fork yet; MTP itself does not depend on it. llama_init = common_init_from_params(params_base); From de389e02144224035641ceb786d856e4f3fe736c Mon Sep 17 00:00:00 2001 From: TheTom Date: Tue, 9 Jun 2026 16:29:06 -0500 Subject: [PATCH 18/19] vulkan: fix f16vec4 casts in cm1 FA quantized K/V decode paths dequantize4() returns vec4; the USE_DECODE_K / USE_DECODE_V sites assigned it to f16vec4 locals/shared buffers, which glslc rejects under explicit-arithmetic-types. The f16-cache and zero-fill branches were already correct. Wrap each in f16vec4(). Unblocks coopmat1 FA with quantized KV caches (--cache-type-k q8_0 / --cache-type-v turbo4) on RDNA4. 
--- ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index c418b42cec8e..262d51a041e6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -232,7 +232,7 @@ void main() { uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE_K + 4 * d; uint ib = coord / BLOCK_SIZE_K; uint iqs = (coord % BLOCK_SIZE_K); - K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K); + K_Tf = f16vec4(dequantize4(ib, iqs, k_offset, BINDING_IDX_K)); } else { #if !defined(DATA_A_TURBO3_0) K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]); @@ -274,7 +274,7 @@ void main() { uint coord = (j * Bc + row) * k_stride * BLOCK_SIZE_K + d * 16 + col_vec * 4; uint ib = coord / BLOCK_SIZE_K; uint iqs = (coord % BLOCK_SIZE_K); - K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K); + K_Tf = f16vec4(dequantize4(ib, iqs, k_offset, BINDING_IDX_K)); } else { #if !defined(DATA_A_TURBO3_0) K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]); @@ -403,7 +403,7 @@ void main() { uint coord = (j * Bc + c) * v_stride * BLOCK_SIZE_V + 4 * d; uint ib = coord / BLOCK_SIZE_V; uint iqs = (coord % BLOCK_SIZE_V); - V_Tf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V); + V_Tf = f16vec4(dequantize4(ib, iqs, v_offset, BINDING_IDX_V)); } else { #if !defined(DATA_A_TURBO3_0) V_Tf = f16vec4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]); @@ -453,7 +453,7 @@ void main() { if (!KV_bounds_check || (v_row < KV && v_col < HSV)) { if (USE_DECODE_V) { - kvsh[row * vsh_stride + col] = dequantize4(ib, iqs, v_offset, BINDING_IDX_V); + kvsh[row * vsh_stride + col] = f16vec4(dequantize4(ib, iqs, v_offset, BINDING_IDX_V)); } else { #if !defined(DATA_A_TURBO3_0) kvsh[row * vsh_stride + col] = 
data_vv4[(v_offset + v_row * v_stride + v_col) / 4]; From 469c9c43258bb5300da0148fb72ad3768aa69f33 Mon Sep 17 00:00:00 2001 From: TheTom Date: Tue, 9 Jun 2026 16:29:06 -0500 Subject: [PATCH 19/19] mtp: fix standalone all-nextn draft KV cache (gemma4-assistant) Two fixes so a standalone MTP draft GGUF whose layers are ALL nextn (gemma4-assistant: n_layer_all == n_layer_nextn, so n_layer() == 0) initializes and engages speculative decoding: 1. llama-kv-cache.cpp: the ctor iterated hparams.n_layer() (excludes nextn layers) for the per-layer KV loop; the #24060 reconciliation wired it to the nextn-excluding method, but upstream loops the full hparams.n_layer member. With n_layer() == 0 the draft registered ZERO KV layers -> map_layer_ids empty -> get_k(0) threw std::out_of_range during draft-context reserve. Loop over hparams.n_layer_all instead; has_kv() still gates per-layer. 2. llama-graph.cpp: port upstream #24294 - guard the iSWA kq_mask on its own buffer in set_input/can_reuse (base and swa). A SWA-only draft head leaves the base sub-cache empty, so its mask buffer is null. Verified on RDNA4/Vulkan: gemma4-12B MTP assistant loads, drafts at ~0.56-0.75 acceptance with q8_0 K / turbo4 V. 
--- src/llama-graph.cpp | 17 +++++++++++++---- src/llama-kv-cache.cpp | 5 ++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 2bc43eece94a..b2d7af9b1a3a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -539,7 +539,10 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); } - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + // #24294: guard kq mask on its own buffer (SWA-only/empty sub-cache) + if (self_kq_mask && self_kq_mask->buffer) { + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + } // swa tensors may not be allocated if there are no SWA attention layers if (self_k_idxs_swa && self_k_idxs_swa->buffer) { @@ -547,7 +550,9 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); } - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + if (self_kq_mask_swa && self_kq_mask_swa->buffer) { + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + } if (self_k_rot) { mctx->get_base()->set_input_k_rot(self_k_rot); @@ -579,7 +584,9 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there } - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + if (self_kq_mask && self_kq_mask->buffer) { + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + } // swa tensors may not be allocated if there are no SWA attention layers if (self_k_idxs_swa && self_k_idxs_swa->buffer) { @@ -587,7 +594,9 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { //res &= 
self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there } - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + if (self_kq_mask_swa && self_kq_mask_swa->buffer) { + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + } return res; } diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index c3c5bcfe6df0..5f4aa3010a42 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -149,7 +149,10 @@ llama_kv_cache::llama_kv_cache( } } - const uint32_t n_layer = hparams.n_layer(); + // #24060/MTP fix: iterate ALL layers (incl. nextn) so an all-nextn draft + // (gemma4-assistant: n_layer()==0) registers its KV layers; has_kv() still + // gates per-layer. Upstream loops the full hparams.n_layer member here. + const uint32_t n_layer = hparams.n_layer_all; const uint32_t n_layer_kv = hparams.n_layer_kv(); // define a comparator for the buft -> ctx map to ensure that the order is well-defined: