diff --git a/common/arg.cpp b/common/arg.cpp index a4e5bc661202..8b8e5f59a487 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -540,7 +540,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } if (!seen_args.insert(arg).second) { - LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + const bool skip = (arg == "--spec-type"); + + if (!skip) { + LOG_WRN("DEPRECATED: argument '%s' specified multiple times, use comma-separated values instead (only last value will be used)\n", arg.c_str()); + } } auto & tmp = arg_to_options[arg]; auto opt = *tmp.first; @@ -903,7 +907,11 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map", "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" @@ -4141,6 +4154,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.ngram_mod.n_match = 24; params.speculative.ngram_mod.n_min = 48; params.speculative.ngram_mod.n_max = 64; + + // TODO: not sure if this is a good config - explore more settings and potentially enable it + //params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V); + //params.speculative.ngram_map_k4v.size_n = 8; + //params.speculative.ngram_map_k4v.size_m = 24; + //params.speculative.ngram_map_k4v.min_hits = 2; } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI})); diff --git a/common/common.cpp b/common/common.cpp index 506db3ef64d0..3b058def7db5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1258,29 +1258,6 @@ common_init_result::common_init_result(common_params & params) : cparams.n_samplers = pimpl->samplers_seq_config.size(); } - // [TAG_RS_STATE_ROLLBACK_SUPPORT] - // TODO: ngram speculative methods require checkpointing in addition to partial RS rollback - // currently this is not supported. so we disable the partial rollback - if (cparams.n_rs_seq > 0 && (llama_model_is_recurrent(model) || llama_model_is_hybrid(model))) { - auto & types = params.speculative.types; - - for (int i = 0; i < (int) types.size(); i++) { - if (types[i] == COMMON_SPECULATIVE_TYPE_NONE) { - continue; - } - if (types[i] == COMMON_SPECULATIVE_TYPE_DRAFT_MTP) { - continue; - } - - cparams.n_rs_seq = 0; - - LOG_WRN("%s: recurrent state rollback is not compatible with '%s' - disabling rollback support\n", __func__, - common_speculative_type_to_str(types[i]).c_str()); - - break; - } - } - llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); @@ -1562,6 +1539,7 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.n_ctx = params.n_ctx; cparams.n_seq_max = params.n_parallel; + cparams.n_outputs_max = params.n_outputs_max; cparams.n_rs_seq = params.speculative.need_n_rs_seq(); cparams.n_batch = params.n_batch; cparams.n_ubatch = params.n_ubatch; diff --git a/common/common.h b/common/common.h index 46efe749185b..9718863ecc55 100644 --- a/common/common.h +++ b/common/common.h @@ -299,11 +299,13 @@ struct common_params_model { // draft-model-based speculative decoding parameters struct common_params_speculative_draft { - int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding - int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding + int32_t n_max = 3; // maximum number of tokens to draft during speculative decoding + int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding - float p_split = 0.1f; // speculative decoding split probability - float p_min = 0.75f; // minimum speculative decoding probability (greedy) // TODO: change default to 0.0f + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.0f; // minimum speculative decoding probability (greedy) + + bool backend_sampling = true; // offload draft sampling to the backend (default: on) common_params_model mparams; @@ -428,6 +430,7 @@ struct common_params { int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_outputs_max = 0; // max outputs supported by the context (0 = derive) int32_t n_sequences = 1; // number of sequences to decode int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width diff --git a/common/ngram-map.cpp b/common/ngram-map.cpp index 8e3978f7ed02..a2a4e50575f2 100644 --- a/common/ngram-map.cpp +++ b/common/ngram-map.cpp @@ -500,7 +500,7 @@ void common_ngram_map_draft(common_ngram_map & map, draft.push_back(inp[match_pos + n + i]); } - LOG_INF("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__, + LOG_DBG("%s: key_offset = %zu, slot_max = %d, key_num = %d, draft.size = %zu\n", __func__, key_offset, slot_max, curr_key.key_num, draft.size()); diff --git a/common/speculative.cpp b/common/speculative.cpp index ebaef15bde11..86c1e6a42903 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -3,13 +3,14 @@ #include "common.h" #include "ggml.h" #include "llama.h" -#include "../src/llama-ext.h" // staging API: llama_set_embeddings_pre_norm / llama_get_embeddings_pre_norm_ith (used by MTP) #include "log.h" #include "ngram-cache.h" #include "ngram-map.h" #include "ngram-mod.h" #include "sampling.h" +#include "../src/llama-ext.h" // staging API: llama_set_embeddings_nextn / llama_get_embeddings_nextn_ith (used by MTP) + #include #include #include @@ -32,6 +33,18 @@ const std::map common_speculative_type_fro {"ngram-cache", COMMON_SPECULATIVE_TYPE_NGRAM_CACHE} }; +static std::string common_speculative_get_devices_str(const std::vector & devices) { + std::string result; + for (size_t i = 0; i < devices.size(); i++) { + if (devices[i] == nullptr) { + continue; + } + if (!result.empty()) result += ", "; + result += ggml_backend_dev_name(devices[i]); + } + return result.empty() ? "default" : result; +} + struct common_speculative_config { common_speculative_type type; common_params_speculative params; @@ -46,10 +59,10 @@ static bool common_speculative_are_compatible( const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); - const bool vocab_type_tgt = llama_vocab_type(vocab_tgt); + const auto vocab_type_tgt = llama_vocab_type(vocab_tgt); LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); - const bool vocab_type_dft = llama_vocab_type(vocab_dft); + const auto vocab_type_dft = llama_vocab_type(vocab_dft); LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); if (vocab_type_tgt != vocab_type_dft) { @@ -144,10 +157,13 @@ struct common_speculative_impl { virtual void draft(common_speculative_draft_params_vec & dparams) = 0; - virtual void accept(llama_seq_id seq_id, uint16_t n_accepted) = 0; + virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0; - // true if this implementation requires the target context to extract embeddings + // true if this implementation requires the target context to extract post-norm embeddings virtual bool need_embd() const = 0; + + // true if this implementation requires the target context to extract pre-norm embeddings + virtual bool need_embd_nextn() const { return false; } }; struct common_speculative_impl_draft_simple : public common_speculative_impl { @@ -164,6 +180,16 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; auto * ctx_tgt = this->params.ctx_tgt; + LOG_INF("%s: adding speculative implementation 'draft-simple'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min); + LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, + this->params.n_gpu_layers, + ggml_type_name(this->params.cache_type_k), + ggml_type_name(this->params.cache_type_v), + ctx_tgt ? "yes" : "no", + ctx_dft ? "yes" : "no", + common_speculative_get_devices_str(this->params.devices).c_str()); + batch = llama_batch_init(llama_n_batch(ctx_dft), 0, 1); // TODO: optimize or pass from outside? @@ -340,7 +366,7 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -352,8 +378,12 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { //common_params_speculative_eagle3 params; - common_speculative_impl_draft_eagle3(const common_params_speculative & /*params*/, uint32_t n_seq) - : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) {} + common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq) + : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) + { + LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min); + } void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -368,7 +398,7 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { // TODO: implement } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -377,15 +407,20 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { } }; -struct common_speculative_state_draft_mtp : public common_speculative_impl { +struct common_speculative_impl_draft_mtp : public common_speculative_impl { common_params_speculative_draft params; // reuses the draft-model params slot (ctx_tgt/ctx_dft) llama_batch batch; std::vector smpls; + // backend sampler chain per seq, attached to ctx_dft + std::vector backend_chains; + int32_t n_embd = 0; + bool is_mem_shared = false; + // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1. // The last h-row of one process() call needs the first token of the NEXT // call to pair with, so it's stashed here until that next call fires. @@ -404,7 +439,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { // pre-advancement before process() mirrored the verify batch. std::vector last_n_drafted; - common_speculative_state_draft_mtp(const common_params_speculative & params, uint32_t n_seq) + common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq) , params(params.draft) { @@ -412,7 +447,19 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { auto * ctx_dft = this->params.ctx_dft; GGML_ASSERT(ctx_tgt && ctx_dft && "MTP requires ctx_tgt and ctx_dft to be set"); - n_embd = llama_model_n_embd(llama_get_model(ctx_dft)); + n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft)); + GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) && + "MTP input row width must match the target h_nextn width"); + + LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling); + LOG_INF("%s: - gpu_layers=%d, cache_k=%s, cache_v=%s, ctx_tgt=%s, ctx_dft=%s, devices=[%s]\n", __func__, + this->params.n_gpu_layers, + ggml_type_name(this->params.cache_type_k), + ggml_type_name(this->params.cache_type_v), + ctx_tgt ? "yes" : "no", + ctx_dft ? "yes" : "no", + common_speculative_get_devices_str(this->params.devices).c_str()); const int32_t n_b = (int32_t) llama_n_batch(ctx_dft); batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd, /*n_seq_max=*/ 1); @@ -424,13 +471,31 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { for (auto & s : smpls) { common_params_sampling sparams; sparams.no_perf = false; - sparams.top_k = 1; // TODO: re-enable top_k == 10 and utilize `p_min` spec param + sparams.top_k = 10; sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K }; s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams)); } - llama_set_embeddings_pre_norm(ctx_tgt, true); - llama_set_embeddings_pre_norm(ctx_dft, true); + // offload draft sampling to the backend + backend_chains.assign(n_seq, nullptr); + if (this->params.backend_sampling) { + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params()); + llama_sampler_chain_add(chain, llama_sampler_init_top_k(10)); + + if (!llama_set_sampler(ctx_dft, seq_id, chain)) { + LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id); + llama_sampler_free(chain); + chain = nullptr; + } + backend_chains[seq_id] = chain; + } + } + + llama_set_embeddings_nextn(ctx_tgt, true, /*masked*/ false); + llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); + + is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt; pending_h.assign(n_seq, std::vector(n_embd, 0.0f)); @@ -443,7 +508,19 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { last_n_drafted.assign(n_seq, 0); } - ~common_speculative_state_draft_mtp() override { + ~common_speculative_impl_draft_mtp() override { + auto * ctx_dft = this->params.ctx_dft; + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) { + if (backend_chains[seq_id] == nullptr) { + continue; + } + if (ctx_dft) { + llama_set_sampler(ctx_dft, seq_id, nullptr); + } + llama_sampler_free(backend_chains[seq_id]); + } + backend_chains.clear(); + if (batch.token != nullptr) { free(batch.token); batch.token = nullptr; @@ -456,10 +533,12 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { if (N <= 0) { return; } + auto * ctx_dft = this->params.ctx_dft; const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); - if (pos_max < N - 1) { - LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d — " + + if (pos_max < N - 1 && !is_mem_shared) { + LOG_WRN("%s: ctx_dft pos_max=%d < N-1=%d - " "process() hook may not have run on every prefill ubatch " "(need_embd / logits=1 on every prompt position?). " "Drafts may degrade.\n", @@ -501,53 +580,42 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { const size_t row_bytes = (size_t) n_embd * sizeof(float); - common_batch_clear(batch); + // if kv is shared with target (e.g Gemma4), then we can skip this catch-up decode + if (!is_mem_shared) { + common_batch_clear(batch); - for (int k = 0; k < n_tokens; ++k) { - common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0); - } + for (int k = 0; k < n_tokens; ++k) { + common_batch_add(batch, batch_in.token[k], batch_in.pos[k], { batch_in.seq_id[k][0] }, 0); + } - // shift the tgt embeddings to the right by one position - // assumes that the tokens in the batch are sequential for each sequence - // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1] - // ^--- this is a problem - // TODO:this is generally true, but would be nice to assert it - { - const float * h_tgt = llama_get_embeddings_pre_norm(ctx_tgt); - std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1)); + // shift the tgt embeddings to the right by one position + // assumes that the tokens in the batch are sequential for each sequence + // i.e. we cannot have seq_id like this: [0, 0, 0, 1, 1, 0, 1, 1] + // ^--- this is a problem + // TODO:this is generally true, but would be nice to assert it + { + const float * h_tgt = llama_get_embeddings_nextn(ctx_tgt); + std::memcpy(batch.embd + (size_t) 1 * n_embd, h_tgt, row_bytes * (n_tokens-1)); + } - //{ - // // string with seq_ids in the batch - // std::stringstream ss; - // for (int i = 0; i < n_tokens; ++i) { - // ss << batch_in.seq_id[i][0] << ","; - // } - // LOG_WRN("%s: batch_in.seq_id = %s\n", __func__, ss.str().c_str()); - //} - } + // fill the pending embeddings from a previous run + auto set_h = [&](int idx, const float * h_row) { + std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes); + }; - // fill the pending embeddings from a previous run - auto set_h = [&](int idx, const float * h_row) { - std::memcpy(batch.embd + (size_t) idx * n_embd, h_row, row_bytes); - }; + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_batch_beg[seq_id] < 0) { + continue; + } - for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { - if (i_batch_beg[seq_id] < 0) { - continue; + set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); } - set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); - } - - // During prompt sync the draft MTP context only needs its cache/state - // updated. Host-visible pre-norm rows are consumed during draft() - // generation, not while mirroring prompt batches. - llama_set_embeddings_pre_norm(ctx_dft, false); - const int32_t rc = llama_decode(ctx_dft, batch); - llama_set_embeddings_pre_norm(ctx_dft, true); - if (rc != 0) { - LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); - return false; + const int32_t rc = llama_decode(ctx_dft, batch); + if (rc != 0) { + LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); + return false; + } } for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { @@ -560,7 +628,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { verify_h[seq_id].resize((size_t) n_rows * n_embd); for (int32_t i = 0; i < n_rows; ++i) { - const float * h = llama_get_embeddings_pre_norm_ith(ctx_tgt, i_batch_beg[seq_id] + i); + const float * h = llama_get_embeddings_nextn_ith(ctx_tgt, i_batch_beg[seq_id] + i); std::memcpy(verify_h[seq_id].data() + (size_t) i * n_embd, h, row_bytes); } @@ -621,7 +689,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { auto * smpl = smpls[seq_id].get(); common_sampler_sample(smpl, ctx_dft, i_batch, true); - h_row = llama_get_embeddings_pre_norm_ith(ctx_dft, i_batch); + h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_batch); ++i_batch; const auto * cur_p = common_sampler_get_candidates(smpl, true); @@ -635,6 +703,14 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { // add drafted token for each sequence const llama_token id = cur_p->data[0].id; + // only collect very high-confidence draft tokens + if (cur_p->data[0].p < params.p_min) { + drafting[seq_id] = false; + n_drafting--; + + continue; + } + common_sampler_accept(smpl, id, true); auto & dp = dparams.at(seq_id); @@ -648,7 +724,13 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { continue; } - common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); + if (is_mem_shared) { + // note: with shared memory (e.g. Gemma4 assistants) we use the same position for all draft tokens + // ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37 + common_batch_add(batch, id, dp.n_past, { seq_id }, true); + } else { + common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); + } std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes); } @@ -680,7 +762,7 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override { if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { return; } @@ -696,6 +778,10 @@ struct common_speculative_state_draft_mtp : public common_speculative_impl { } bool need_embd() const override { + return false; + } + + bool need_embd_nextn() const override { return true; } }; @@ -712,7 +798,12 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { common_ngram_simple_config config) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, n_seq) , params(params.ngram_simple) - , config(config) {} + , config(config) + { + LOG_INF("%s: adding speculative implementation 'ngram-simple'\n", __func__); + LOG_INF("%s: - size_n=%d, size_m=%d, min_hits=%d\n", __func__, + this->params.size_n, this->params.size_m, this->params.min_hits); + } void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { // noop @@ -736,7 +827,7 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -746,20 +837,21 @@ struct common_speculative_impl_ngram_simple : public common_speculative_impl { }; struct common_speculative_impl_ngram_map_k : public common_speculative_impl { - common_params_speculative_ngram_map params; - // n_seq configs std::vector config; common_speculative_impl_ngram_map_k( - const common_params_speculative & params, const common_ngram_map & config, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, n_seq) - , params(params.ngram_map_k) { + { for (uint32_t i = 0; i < n_seq; i++) { this->config.push_back(config); } + + LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(this->type).c_str()); + LOG_INF("%s: - size_key=%d, size_value=%d, key_only=%d, min_hits=%d\n", __func__, + config.size_key, config.size_value, config.key_only, config.min_hits); } void begin(llama_seq_id seq_id, const llama_tokens & prompt) override { @@ -786,9 +878,13 @@ struct common_speculative_impl_ngram_map_k : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override { GGML_ASSERT((seq_id < (llama_seq_id) config.size())); + if (is_other) { + return; + } + common_ngram_map_accept(config[seq_id], n_accepted); } @@ -810,7 +906,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { // the last position in the prompt that was added to the ngram container size_t i_last = 0; - // length of the last drafted n‑gram (number of tokens returned by draft) + // length of the last drafted n-gram (number of tokens returned by draft) size_t n_draft_last = 0; // consecutive accept rounds with low acceptance fraction (< 0.5) @@ -828,8 +924,11 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { , verbose(std::getenv("LLAMA_TRACE") != nullptr) { static_assert(sizeof(llama_token) == sizeof(common_ngram_mod::entry_t)); - LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__, - this->params.n_match, mod.size(), (float)(mod.size_bytes())/1024/1024); + LOG_INF("%s: adding speculative implementation 'ngram-mod'\n", __func__); + LOG_INF("%s: - n_match=%d, n_max=%d, n_min=%d\n", __func__, + this->params.n_match, this->params.n_max, this->params.n_min); + LOG_INF("%s: - mod size=%zu (%.3f MB)\n", __func__, + mod.size(), (float)(mod.size_bytes())/1024/1024); if (this->params.n_match < 16) { LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, " @@ -919,7 +1018,7 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } result.resize(result.size() - n); - // store length of drafted n‑gram for later acceptance analysis + // store length of drafted n-gram for later acceptance analysis sinfo.n_draft_last = result.size(); } @@ -941,17 +1040,21 @@ struct common_speculative_impl_ngram_mod : public common_speculative_impl { } } - void accept(llama_seq_id seq_id, uint16_t n_accepted) override { + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) override { + if (is_other) { + return; + } + auto & sinfo = sinfos[seq_id]; // compute acceptance fraction if we have a recorded draft length if (sinfo.n_draft_last > 0) { const double f_acc = (double)n_accepted / (double)sinfo.n_draft_last; - if (f_acc < 0.5) { + if (f_acc < 0.25) { sinfo.n_low++; - if (sinfo.n_low >= 3) { + if (sinfo.n_low >= 5) { if (verbose) { - LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, sinfo.n_low); + LOG_WRN("%s: low acceptance streak (%d) - resetting ngram_mod\n", __func__, sinfo.n_low); } mod.reset(); @@ -1001,6 +1104,12 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl { , save_dynamic(save_dynamic) , save_static(save_static) { + LOG_INF("%s: adding speculative implementation 'ngram-cache'\n", __func__); + LOG_INF("%s: - n_draft=%d, cache_static=%s, cache_dynamic=%s\n", __func__, + n_draft, + path_static.empty() ? "none" : path_static.c_str(), + path_dynamic.empty() ? "none" : path_dynamic.c_str()); + sinfos.resize(n_seq); if (!path_static.empty()) { @@ -1097,7 +1206,7 @@ struct common_speculative_impl_ngram_cache : public common_speculative_impl { } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/) override { + void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { // noop } @@ -1217,6 +1326,40 @@ static uint32_t common_get_enabled_speculative_configs(const std::vectortypes) { + switch (type) { + case COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE: + case COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3: + case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: + n_max = std::max(n_max, std::max(0, spec->draft.n_max)); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: + n_max = std::max(n_max, (int32_t) spec->ngram_simple.size_m); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: + n_max = std::max(n_max, (int32_t) spec->ngram_map_k.size_m); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: + n_max = std::max(n_max, (int32_t) spec->ngram_map_k4v.size_m); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: + n_max = std::max(n_max, std::max(0, spec->ngram_mod.n_max)); + break; + case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: + n_max = std::max(n_max, (int32_t) 8); + break; + case COMMON_SPECULATIVE_TYPE_NONE: + case COMMON_SPECULATIVE_TYPE_COUNT: + break; + } + } + + return n_max; +} + // initialization of the speculative decoding system // common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq) { @@ -1225,8 +1368,6 @@ common_speculative * common_speculative_init(common_params_speculative & params, { uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types); - bool has_draft_model_path = !params.draft.mparams.path.empty(); - bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE)); bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3 bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr; @@ -1259,16 +1400,6 @@ common_speculative * common_speculative_init(common_params_speculative & params, if (has_ngram_cache) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_CACHE, params)); } - if (has_draft_simple) { - if (!has_draft_model_path) { - LOG_WRN("%s: draft model is not specified - cannot use 'draft' type\n", __func__); - has_draft_simple = false; - } - } else if (has_draft_model_path && !has_mtp && !has_draft_eagle3) { - LOG_WRN("%s: draft model is specified but 'draft' speculative type is not explicitly enabled - enabling it\n", __func__); - has_draft_simple = true; - } - if (has_draft_simple) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, params)); } @@ -1283,7 +1414,6 @@ common_speculative * common_speculative_init(common_params_speculative & params, std::vector> impls = {}; for (const common_speculative_config & config : configs) { - LOG_INF("%s: adding speculative implementation '%s'\n", __func__, common_speculative_type_to_str(config.type).c_str()); switch (config.type) { case COMMON_SPECULATIVE_TYPE_NONE: break; @@ -1296,7 +1426,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, break; } case COMMON_SPECULATIVE_TYPE_DRAFT_MTP: { - impls.push_back(std::make_unique(config.params, n_seq)); + impls.push_back(std::make_unique(config.params, n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: { @@ -1317,11 +1447,16 @@ common_speculative * common_speculative_init(common_params_speculative & params, impls.push_back(std::move(state)); break; } - case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: + case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K: { + impls.push_back( + std::make_unique( + get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); + break; + } case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: { impls.push_back( std::make_unique( - config.params, get_common_ngram_map(config.type, config.params.ngram_map_k), n_seq)); + get_common_ngram_map(config.type, config.params.ngram_map_k4v), n_seq)); break; } case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: { @@ -1413,6 +1548,20 @@ bool common_speculative_need_embd(common_speculative * spec) { return false; } +bool common_speculative_need_embd_nextn(common_speculative * spec) { + if (spec == nullptr) { + return false; + } + + for (auto & impl : spec->impls) { + if (impl->need_embd_nextn()) { + return true; + } + } + + return false; +} + void common_speculative_draft(common_speculative * spec) { if (spec == nullptr) { return; @@ -1499,11 +1648,6 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u GGML_ASSERT(impl); - // TODO: currently only the implementation that generated the draft is used to accept it - // however, some implementations (such as MTP) need to also "see" the accepted tokens - // extend `common_speculative_impl::accept()` with an extra argument `bool is_other` to - // inform the implementation if the accepted tokens are from another implementation and - // pass the accepted tokens to all remaining implementations using `is_other == true` { common_time_meas tm(impl->t_accept_us, !impl->gen_perf); if (n_accepted > 0) { @@ -1511,9 +1655,16 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u impl->n_acc_tokens += n_accepted; } - impl->accept(seq_id, n_accepted); + impl->accept(seq_id, n_accepted, false); impl->n_call_accept++; } + + // accept with the rest of the implementations, using is_other == true + for (auto & impl_other : spec->impls) { + if (impl_other.get() != impl) { + impl_other->accept(seq_id, n_accepted, true); + } + } } void common_speculative_print_stats(const common_speculative * spec) { @@ -1533,7 +1684,7 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } - LOG_INF("statistics %s: #calls(b,g,a) = %zu %zu %zu, #gen drafts = %zu, #acc drafts = %zu, #gen tokens = %zu, #acc tokens = %zu%s\n", + LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n", common_speculative_type_to_str(impl->type).c_str(), impl->n_call_begin, impl->n_call_draft, impl->n_call_accept, impl->n_gen_drafts, diff --git a/common/speculative.h b/common/speculative.h index 614db9b1b509..bf76ad709e26 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -20,6 +20,9 @@ enum common_speculative_type common_speculative_type_from_name(const std::string // convert type to string std::string common_speculative_type_to_str(enum common_speculative_type type); +// return the max number of draft tokens based on the speculative parameters +int32_t common_speculative_n_max(const common_params_speculative * spec); + common_speculative * common_speculative_init(common_params_speculative & params, uint32_t n_seq); void common_speculative_free(common_speculative * spec); @@ -53,9 +56,12 @@ void common_speculative_begin(common_speculative * spec, llama_seq_id seq_id, co // process the batch and update the internal state of the speculative context bool common_speculative_process(common_speculative * spec, const llama_batch & batch); -// true if any implementation requires target embeddings to be extracted +// true if any implementation requires target post-norm embeddings to be extracted bool common_speculative_need_embd(common_speculative * spec); +// true if any implementation requires target nextn embeddings to be extracted +bool common_speculative_need_embd_nextn(common_speculative * spec); + // generate drafts for the sequences specified with `common_speculative_get_draft_params` void common_speculative_draft(common_speculative * spec); diff --git a/conversion/__init__.py b/conversion/__init__.py index e933280a9422..14b86198631b 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -73,9 +73,11 @@ "Gemma3TextModel": "gemma", "Gemma3nForCausalLM": "gemma", "Gemma3nForConditionalGeneration": "gemma", + "Gemma4AssistantForCausalLM": "gemma", "Gemma4ForConditionalGeneration": "gemma", "Gemma4ForCausalLM": "gemma", "Gemma4UnifiedForConditionalGeneration": "gemma", + "Gemma4UnifiedAssistantForCausalLM": "gemma", "GemmaForCausalLM": "gemma", "Glm4ForCausalLM": "glm", "Glm4MoeForCausalLM": "glm", diff --git a/conversion/gemma.py b/conversion/gemma.py index 31a450ed4824..5339ec016780 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -785,6 +785,16 @@ def set_gguf_parameters(self): self.gguf_writer.add_suppress_tokens(suppress_tokens) +@ModelBase.register("Gemma4AssistantForCausalLM", "Gemma4UnifiedAssistantForCausalLM") +class Gemma4AssistantModel(Gemma4Model): + model_arch = gguf.MODEL_ARCH.GEMMA4_ASSISTANT + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_embedding_length_out(self.hparams["backbone_hidden_size"]) + self.gguf_writer.add_nextn_predict_layers(self.block_count) + + @ModelBase.register("Gemma4ForConditionalGeneration") class Gemma4VisionAudioModel(MmprojModel): has_audio_encoder = True diff --git a/docs/speculative.md b/docs/speculative.md index fb6ef03067d3..45e42d42a436 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -108,11 +108,12 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### General Speculative Parameters ``` ---spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] - type of speculative decoding to use when no draft model is provided +--spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] + comma-separated list of types of speculative decoding to use (default: none) (env: LLAMA_ARG_SPEC_TYPE) ---spec-default use default speculative decoding +--spec-default use default speculative decoding config + (enables ngram-mod) ``` ### Draft Model Parameters @@ -123,8 +124,9 @@ If a draft model is combined with a draftless decoding the draftless decoding ha (env: LLAMA_ARG_SPEC_DRAFT_MODEL) --spec-draft-hf, -hfd, -hfrd, --hf-repo-draft /[:quant] HuggingFace repository for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) --spec-draft-n-max N - number of tokens to draft for speculative decoding (default: 16) + number of tokens to draft for speculative decoding (default: 3) (env: LLAMA_ARG_SPEC_DRAFT_N_MAX) --spec-draft-n-min N minimum number of draft tokens to use for speculative decoding (default: 0) @@ -133,18 +135,64 @@ If a draft model is combined with a draftless decoding the draftless decoding ha speculative decoding split probability (default: 0.10) (env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) --spec-draft-p-min, --draft-p-min P - minimum speculative decoding probability (greedy) (default: 0.75) + minimum speculative decoding probability (greedy) (default: 0.00) (env: LLAMA_ARG_SPEC_DRAFT_P_MIN) ---spec-draft-ctx-size, -cd, --ctx-size-draft N - size of the prompt context for the draft model (default: 0, 0 = loaded from model) - (env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE) --spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto) (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) --spec-draft-device, -devd, --device-draft comma-separated list of devices to use for offloading the draft model ---spec-draft-replace, --spec-replace TARGET DRAFT - translate the string in TARGET into DRAFT if the draft model and main model are not compatible + (use --list-devices to see available devices) +``` + +### Draft Model CPU Scheduling Parameters + +``` +--spec-draft-threads, -td, --threads-draft N + number of CPU threads to use during generation +--spec-draft-threads-batch, -tbd, --threads-batch-draft N + number of threads to use during batch and prompt processing (default: same as --threads-draft) +--spec-draft-cpu-mask, -Cd, --cpu-mask-draft M + Draft model CPU affinity mask. Complements cpu-range-draft +--spec-draft-cpu-range, -Crd, --cpu-range-draft lo-hi + Ranges of CPUs for affinity. Complements --cpu-mask-draft +--spec-draft-cpu-strict, --cpu-strict-draft <0|1> + Use strict CPU placement for draft model (default: same as --cpu-strict) +--spec-draft-prio, --prio-draft N + set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime +--spec-draft-poll, --poll-draft <0|1> + Use polling to wait for draft model work (default: same as --poll) +--spec-draft-cpu-mask-batch, -Cbd, --cpu-mask-batch-draft M + Draft model CPU affinity mask for batch. Complements cpu-range-batch-draft +--spec-draft-cpu-range-batch, -Crbd, --cpu-range-batch-draft lo-hi + Ranges of CPUs for affinity for batch. Complements --cpu-mask-batch-draft +--spec-draft-cpu-strict-batch, --cpu-strict-batch-draft <0|1> + Use strict CPU placement for draft model batch (default: --cpu-strict-draft) +--spec-draft-prio-batch, --prio-batch-draft N + set draft process/thread priority for batch : 0-normal, 1-medium, 2-high, 3-realtime +--spec-draft-poll-batch, --poll-batch-draft <0|1> + Use polling to wait for draft model work for batch (default: --poll-draft) +``` + +### Draft Model KV Cache and Tensor Override Parameters + +``` +--spec-draft-type-k, -ctkd, --cache-type-k-draft TYPE + KV cache data type for K for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_K) +--spec-draft-type-v, -ctvd, --cache-type-v-draft TYPE + KV cache data type for V for the draft model + allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1 + (env: LLAMA_ARG_SPEC_DRAFT_CACHE_TYPE_V) +--spec-draft-override-tensor, -otd, --override-tensor-draft =,... + override tensor buffer type for draft model +--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft + keep all Mixture of Experts (MoE) weights in the CPU for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) +--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N + keep the MoE weights of the first N layers in the CPU for the draft model + (env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) ``` ### n-gram Mod Parameters @@ -193,11 +241,13 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### `--spec-type TYPE` -Specifies a type of speculative decoding without draft model. +Specifies a comma-separated list of speculative decoding types to use. | Type | Description | |------|-------------| | `none` | No speculative decoding (default) | +| `draft-simple` | Use a simple draft model for speculation | +| `draft-mtp` | Use Masked Token Prediction (MTP) heads from the main model | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching | | `ngram-map-k` | Use n-gram pattern matching with n-gram-keys | @@ -209,6 +259,11 @@ Specifies a type of speculative decoding without draft model. ./llama-server [...] --spec-type ngram-simple ``` +**Example:** Multiple speculative implementations. +```bash +./llama-server [...] --spec-type ngram-mod,ngram-map-k4v +``` + ### `--spec-ngram-*-size-n N` Sets the size N of the lookup n-gram for n-gram map based speculative decoding. diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index c418b42cec8e..262d51a041e6 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -232,7 +232,7 @@ void main() { uint coord = (j * Bc + c) * k_stride * BLOCK_SIZE_K + 4 * d; uint ib = coord / BLOCK_SIZE_K; uint iqs = (coord % BLOCK_SIZE_K); - K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K); + K_Tf = f16vec4(dequantize4(ib, iqs, k_offset, BINDING_IDX_K)); } else { #if !defined(DATA_A_TURBO3_0) K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + c) * k_stride / 4 + d]); @@ -274,7 +274,7 @@ void main() { uint coord = (j * Bc + row) * k_stride * BLOCK_SIZE_K + d * 16 + col_vec * 4; uint ib = coord / BLOCK_SIZE_K; uint iqs = (coord % BLOCK_SIZE_K); - K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K); + K_Tf = f16vec4(dequantize4(ib, iqs, k_offset, BINDING_IDX_K)); } else { #if !defined(DATA_A_TURBO3_0) K_Tf = f16vec4(data_kv4[k_offset / 4 + (j * Bc + row) * k_stride / 4 + d * 16 / 4 + col_vec]); @@ -403,7 +403,7 @@ void main() { uint coord = (j * Bc + c) * v_stride * BLOCK_SIZE_V + 4 * d; uint ib = coord / BLOCK_SIZE_V; uint iqs = (coord % BLOCK_SIZE_V); - V_Tf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V); + V_Tf = f16vec4(dequantize4(ib, iqs, v_offset, BINDING_IDX_V)); } else { #if !defined(DATA_A_TURBO3_0) V_Tf = f16vec4(data_vv4[v_offset / 4 + (j * Bc + c) * v_stride / 4 + d]); @@ -453,7 +453,7 @@ void main() { if (!KV_bounds_check || (v_row < KV && v_col < HSV)) { if (USE_DECODE_V) { - kvsh[row * vsh_stride + col] = dequantize4(ib, iqs, v_offset, BINDING_IDX_V); + kvsh[row * vsh_stride + col] = f16vec4(dequantize4(ib, iqs, v_offset, BINDING_IDX_V)); } else { #if !defined(DATA_A_TURBO3_0) kvsh[row * vsh_stride + col] = data_vv4[(v_offset + v_row * v_stride + v_col) / 4]; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 67e81ab5f836..5bb723f6416a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -431,6 +431,7 @@ class MODEL_ARCH(IntEnum): GEMMA3 = auto() GEMMA3N = auto() GEMMA4 = auto() + GEMMA4_ASSISTANT = auto() GEMMA_EMBEDDING = auto() STARCODER2 = auto() RWKV6 = auto() @@ -858,6 +859,8 @@ class MODEL_TENSOR(IntEnum): A_PER_DIM_K_SCALE = auto() # gemma4 A_PER_DIM_SCALE = auto() # gemma4 # nextn/mtp + NEXTN_PROJ_PRE = auto() + NEXTN_PROJ_POST = auto() NEXTN_EH_PROJ = auto() NEXTN_EMBED_TOKENS = auto() NEXTN_ENORM = auto() @@ -947,6 +950,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.GEMMA3: "gemma3", MODEL_ARCH.GEMMA3N: "gemma3n", MODEL_ARCH.GEMMA4: "gemma4", + MODEL_ARCH.GEMMA4_ASSISTANT: "gemma4-assistant", MODEL_ARCH.GEMMA_EMBEDDING: "gemma-embedding", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.RWKV6: "rwkv6", @@ -1404,6 +1408,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.A_QF_FFN_DOWN: "a.proj_blk.{bid}.ffn_down", MODEL_TENSOR.A_QF_FFN_NORM: "a.proj_blk.{bid}.ffn_norm", # NextN/MTP + MODEL_TENSOR.NEXTN_PROJ_PRE: "nextn.pre_projection", + MODEL_TENSOR.NEXTN_PROJ_POST: "nextn.post_projection", MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj", MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens", MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm", @@ -2485,6 +2491,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.PER_LAYER_PROJ_NORM, MODEL_TENSOR.PER_LAYER_POST_NORM, ], + MODEL_ARCH.GEMMA4_ASSISTANT: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.NEXTN_PROJ_PRE, + MODEL_TENSOR.NEXTN_PROJ_POST, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_PRE_NORM, + MODEL_TENSOR.FFN_POST_NORM, + MODEL_TENSOR.LAYER_OUT_SCALE, + ], MODEL_ARCH.GEMMA_EMBEDDING: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ab9dfe36758f..85c4541486fc 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -2251,6 +2251,14 @@ class TensorNameMap: ), # NextN/MTP tensors + MODEL_TENSOR.NEXTN_PROJ_PRE: ( + "pre_projection", + ), + + MODEL_TENSOR.NEXTN_PROJ_POST: ( + "post_projection", + ), + MODEL_TENSOR.NEXTN_EH_PROJ: ( "model.layers.{bid}.eh_proj", ), diff --git a/include/llama.h b/include/llama.h index 52eb212466b5..6f6cd5f1f5c0 100644 --- a/include/llama.h +++ b/include/llama.h @@ -340,6 +340,7 @@ extern "C" { uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode uint32_t n_ubatch; // physical maximum batch size uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) + uint32_t n_outputs_max; // max outputs supported by the context (0 = derive from n_seq_max) uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing @@ -389,6 +390,10 @@ extern "C" { // note: the samplers must be sampler chains (i.e. use llama_sampler_chain_init) struct llama_sampler_seq_config * samplers; size_t n_samplers; + + // a source/target/parent context + // can be utilized in various ways, for example by sharing results or llama_memory between 2 contexts + struct llama_context * ctx_other; }; struct llama_model_tensor_override { diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 4a1aaa955a87..3e0fe66afff7 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -41,7 +41,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { auto it = ctx_map.find(buft); if (it == ctx_map.end()) { ggml_init_params params = { - /*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(), + /*.mem_size =*/ hparams.n_layer()*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; @@ -61,9 +61,9 @@ bool llama_adapter_cvec::init(const llama_model & model) { }; // make tensors - tensors.reserve(hparams.n_layer); + tensors.reserve(hparams.n_layer()); tensors.push_back(nullptr); // there's never a tensor for layer 0 - for (size_t il = 1; il < hparams.n_layer; il++) { + for (size_t il = 1; il < hparams.n_layer(); il++) { ggml_backend_buffer_type_t buft = model.select_buft(il); ggml_context * ctx = ctx_for_buft(buft); if (!ctx) { @@ -121,7 +121,7 @@ bool llama_adapter_cvec::apply( layer_start = il_start; layer_end = il_end; - for (size_t il = 1; il < hparams.n_layer; il++) { + for (size_t il = 1; il < hparams.n_layer(); il++) { assert(tensors[il] != nullptr); const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 02ebc2168292..dfbadd7c7a87 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -57,6 +57,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA3, "gemma3" }, { LLM_ARCH_GEMMA3N, "gemma3n" }, { LLM_ARCH_GEMMA4, "gemma4" }, + { LLM_ARCH_GEMMA4_ASSISTANT, "gemma4-assistant" }, { LLM_ARCH_GEMMA_EMBEDDING, "gemma-embedding" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, @@ -243,6 +244,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" }, { LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" }, { LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" }, + { LLM_KV_ATTENTION_RECURRENT_LAYERS, "%s.attention.recurrent_layers" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" }, @@ -446,6 +448,8 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" }, { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, + { LLM_TENSOR_NEXTN_PROJ_PRE, "nextn.pre_projection" }, + { LLM_TENSOR_NEXTN_PROJ_POST, "nextn.post_projection" }, { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" }, { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" }, { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" }, @@ -758,6 +762,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_PROJ_PRE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_NEXTN_PROJ_POST, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so // the model loader doesn't fault on the block index. diff --git a/src/llama-arch.h b/src/llama-arch.h index f3ca6b51c17f..975b6e39b805 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -61,6 +61,7 @@ enum llm_arch { LLM_ARCH_GEMMA3, LLM_ARCH_GEMMA3N, LLM_ARCH_GEMMA4, + LLM_ARCH_GEMMA4_ASSISTANT, LLM_ARCH_GEMMA_EMBEDDING, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, @@ -247,6 +248,7 @@ enum llm_kv { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, LLM_KV_ATTENTION_INDEXER_TOP_K, LLM_KV_ATTENTION_SHARED_KV_LAYERS, + LLM_KV_ATTENTION_RECURRENT_LAYERS, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_COUNT_SWA, @@ -550,6 +552,8 @@ enum llm_tensor { LLM_TENSOR_INDEXER_PROJ, LLM_TENSOR_INDEXER_ATTN_K, LLM_TENSOR_INDEXER_ATTN_Q_B, + LLM_TENSOR_NEXTN_PROJ_PRE, + LLM_TENSOR_NEXTN_PROJ_POST, LLM_TENSOR_NEXTN_EH_PROJ, LLM_TENSOR_NEXTN_EMBED_TOKENS, LLM_TENSOR_NEXTN_ENORM, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c6ce7c707db1..43c2b722f76e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -47,6 +47,7 @@ llama_context::llama_context( const auto & hparams = model.hparams; cparams.n_seq_max = std::max(1u, params.n_seq_max); + cparams.n_outputs_max = params.n_outputs_max; if (cparams.n_seq_max > LLAMA_MAX_SEQ) { throw std::runtime_error("n_seq_max must be <= " + std::to_string(LLAMA_MAX_SEQ)); } @@ -58,18 +59,21 @@ llama_context::llama_context( cparams.n_rs_seq = 0; } - cparams.n_threads = params.n_threads; - cparams.n_threads_batch = params.n_threads_batch; - cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor; - cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor; - cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast; - cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow; - cparams.embeddings = params.embeddings; - cparams.embeddings_pre_norm = false; - cparams.offload_kqv = params.offload_kqv; - cparams.no_perf = params.no_perf; - cparams.pooling_type = params.pooling_type; - cparams.warmup = false; + cparams.n_threads = params.n_threads; + cparams.n_threads_batch = params.n_threads_batch; + cparams.yarn_ext_factor = params.yarn_ext_factor >= 0.0f ? params.yarn_ext_factor : hparams.yarn_ext_factor; + cparams.yarn_attn_factor = params.yarn_attn_factor >= 0.0f ? params.yarn_attn_factor : hparams.yarn_attn_factor; + cparams.yarn_beta_fast = params.yarn_beta_fast >= 0.0f ? params.yarn_beta_fast : hparams.yarn_beta_fast; + cparams.yarn_beta_slow = params.yarn_beta_slow >= 0.0f ? params.yarn_beta_slow : hparams.yarn_beta_slow; + cparams.embeddings = params.embeddings; + cparams.embeddings_nextn = false; + cparams.embeddings_nextn_masked = false; + cparams.offload_kqv = params.offload_kqv; + cparams.no_perf = params.no_perf; + cparams.warmup = false; + + cparams.ctx_type = params.ctx_type; + cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; @@ -82,7 +86,17 @@ llama_context::llama_context( cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; - cparams.ctx_type = params.ctx_type; + cparams.ctx_other = nullptr; + + // TODO: more generic + if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { + if (params.ctx_other == nullptr) { + // TODO: change from runtime_error to llama_exception to avoid printing error message + throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)"); + } + + cparams.ctx_other = params.ctx_other; + } // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later @@ -226,6 +240,7 @@ llama_context::llama_context( LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); LLAMA_LOG_INFO("%s: n_rs_seq = %u\n", __func__, cparams.n_rs_seq); + LLAMA_LOG_INFO("%s: n_outputs_max = %u\n", __func__, cparams.n_outputs_max); if (cparams.n_ctx_seq < hparams.n_ctx_train) { LLAMA_LOG_WARN("%s: n_ctx_seq (%u) < n_ctx_train (%u) -- the full capacity of the model will not be utilized\n", @@ -295,10 +310,11 @@ llama_context::llama_context( // init the memory module if (!hparams.vocab_only) { llama_memory_params params_mem = { - /*.type_k =*/ params.type_k, - /*.type_v =*/ params.type_v, - /*.swa_full =*/ params.swa_full, - /*.ctx_type= */ cparams.ctx_type, + /*.type_k =*/ params.type_k, + /*.type_v =*/ params.type_v, + /*.swa_full =*/ params.swa_full, + /*.ctx_type =*/ cparams.ctx_type, + /*.mem_other =*/ llama_get_memory(cparams.ctx_other), }; memory.reset(model.create_memory(params_mem, cparams)); @@ -336,7 +352,7 @@ llama_context::llama_context( // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = model.n_devices() > 1 && - model.n_gpu_layers() > model.hparams.n_layer && + model.n_gpu_layers() > model.hparams.n_layer() && model.split_mode() == LLAMA_SPLIT_MODE_LAYER && cparams.offload_kqv && !model.has_tensor_overrides(); @@ -881,34 +897,34 @@ float * llama_context::get_embeddings_seq(llama_seq_id seq_id) { return it->second.data(); } -float * llama_context::get_embeddings_pre_norm() { +float * llama_context::get_embeddings_nextn() { output_reorder(); - return embd_pre_norm.data; + return embd_nextn.data; } -float * llama_context::get_embeddings_pre_norm_ith(int32_t i) { +float * llama_context::get_embeddings_nextn_ith(int32_t i) { + output_reorder(); + try { - if (embd_pre_norm.data == nullptr) { - throw std::runtime_error("no pre-norm embeddings"); + if (embd_nextn.data == nullptr) { + throw std::runtime_error("no nextn embeddings"); } - int64_t j = i; - if (j < 0) { - j = n_outputs_pre_norm + j; - if (j < 0) { - throw std::runtime_error(format("negative index out of range [0, %d)", n_outputs_pre_norm)); - } - } + const uint32_t n_embd = model.hparams.n_embd_out(); - if (j >= n_outputs_pre_norm) { - throw std::runtime_error(format("pre-norm embeddings id out of range [0, %d)", n_outputs_pre_norm)); + if (!cparams.embeddings_nextn_masked) { + // unmasked: nextn rows are stored densely, indexed by raw token position. + if (i < 0 || (size_t)(i + 1) * n_embd > embd_nextn.size) { + throw std::runtime_error(format("out of range [0, %zu)", embd_nextn.size / n_embd)); + } + return embd_nextn.data + (size_t) i * n_embd; } - const uint32_t n_embd = model.hparams.n_embd; - return embd_pre_norm.data + j*n_embd; + const int64_t j = output_resolve_row(i); + return embd_nextn.data + j*n_embd; } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: invalid pre-norm embeddings id %d, reason: %s\n", __func__, i, err.what()); + LLAMA_LOG_ERROR("%s: invalid nextn embeddings id %d, reason: %s\n", __func__, i, err.what()); #ifndef NDEBUG GGML_ABORT("fatal error"); #else @@ -1097,10 +1113,11 @@ void llama_context::set_embeddings(bool value) { //sched_need_reserve = true; } -void llama_context::set_embeddings_pre_norm(bool value) { - LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); +void llama_context::set_embeddings_nextn(bool value, bool masked) { + LLAMA_LOG_DEBUG("%s: value = %d, masked = %d\n", __func__, value, masked); - cparams.embeddings_pre_norm = value; + cparams.embeddings_nextn = value; + cparams.embeddings_nextn_masked = masked; } void llama_context::set_causal_attn(bool value) { @@ -1135,6 +1152,19 @@ bool llama_context::set_sampler(llama_seq_id seq_id, llama_sampler * sampler) { LLAMA_LOG_DEBUG("%s: seq_id = %d, sampler = %p\n", __func__, (int) seq_id, (void *) sampler); + if (sampler && model.split_mode() == LLAMA_SPLIT_MODE_TENSOR) { + static bool warned = false; + if (!warned) { + LLAMA_LOG_WARN("%s: backend sampling not supported with SPLIT_MODE_TENSOR; using CPU\n", __func__); + warned = true; + } + if (sampling.samplers.count(seq_id) > 0) { + sched_need_reserve = true; + } + sampling.samplers.erase(seq_id); + return false; + } + const bool can_offload = sampler && sampler->iface->backend_init && @@ -1304,7 +1334,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll } int llama_context::encode(const llama_batch & batch_inp) { - // MTP hook batches carry both token (next-token id) and embd (h_pre_norm row), + // MTP hook batches carry both token (next-token id) and embd (h_nextn row), // so accept either present rather than requiring exactly one. GGML_ASSERT(batch_inp.token || batch_inp.embd); @@ -1355,7 +1385,6 @@ int llama_context::encode(const llama_batch & batch_inp) { } n_outputs = n_tokens; - n_outputs_pre_norm = cparams.embeddings_pre_norm ? n_tokens : 0; const auto causal_attn_org = cparams.causal_attn; @@ -1378,9 +1407,9 @@ int llama_context::encode(const llama_batch & batch_inp) { } } - auto * t_logits = res->get_logits(); - auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); - auto * t_h_pre_norm = cparams.embeddings_pre_norm ? res->get_h_pre_norm() : nullptr; + auto * t_logits = res->get_logits(); + auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd(); + auto * t_h_nextn = cparams.embeddings_nextn ? res->get_h_nextn() : nullptr; // extract logits if (logits.data && t_logits) { @@ -1446,14 +1475,14 @@ int llama_context::encode(const llama_batch & batch_inp) { } } - // extract pre-norm embeddings (hidden state before the final output norm) - if (embd_pre_norm.data && t_h_pre_norm && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm); + // extract nextn embeddings (hidden state before the final output norm) + if (embd_nextn.data && t_h_nextn && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_nextn); GGML_ASSERT(backend_h != nullptr); - const uint32_t n_embd = hparams.n_embd; - GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_pre_norm.size); - ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm.data, 0, n_tokens*n_embd*sizeof(float)); + const uint32_t n_embd = hparams.n_embd_out(); + GGML_ASSERT(n_tokens*n_embd <= (int64_t) embd_nextn.size); + ggml_backend_tensor_get_async(backend_h, t_h_nextn, embd_nextn.data, 0, n_tokens*n_embd*sizeof(float)); } // TODO: hacky solution @@ -1608,7 +1637,7 @@ static bool needs_raw_logits(const llama_ubatch & ubatch, const std::mapget_ubatch(); @@ -1808,9 +1837,9 @@ int llama_context::decode(const llama_batch & batch_inp) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); //} - auto * t_logits = res->get_logits(); - auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; - auto * t_h_pre_norm = cparams.embeddings_pre_norm ? res->get_h_pre_norm() : nullptr; + auto * t_logits = res->get_logits(); + auto * t_embd = cparams.embeddings ? res->get_embd() : nullptr; + auto * t_h_nextn = cparams.embeddings_nextn ? res->get_h_nextn() : nullptr; if (t_embd && res->get_embd_pooled()) { t_embd = res->get_embd_pooled(); @@ -1891,19 +1920,23 @@ int llama_context::decode(const llama_batch & batch_inp) { } } - // extract pre-norm embeddings (hidden state before the final output norm) + // extract nextn embeddings before // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored. - if (embd_pre_norm.data && t_h_pre_norm && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_pre_norm); - GGML_ASSERT(backend_h != nullptr); + { + const bool masked = cparams.embeddings_nextn_masked; + const int64_t n_rows = masked ? n_outputs : (int64_t) ubatch.n_tokens; + const int64_t offset = masked ? n_outputs_prev : n_tokens_prev; - const uint32_t n_embd = hparams.n_embd; - const int64_t n_outputs_pre_norm_new = t_h_pre_norm->ne[1]; - float * embd_pre_norm_out = embd_pre_norm.data + n_outputs_pre_norm_prev*n_embd; + if (embd_nextn.data && t_h_nextn && n_rows > 0 && cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + ggml_backend_t backend_h = ggml_backend_sched_get_tensor_backend(sched.get(), t_h_nextn); + GGML_ASSERT(backend_h != nullptr); - GGML_ASSERT(n_outputs_pre_norm_prev + n_outputs_pre_norm_new <= (int64_t) embd_pre_norm.size/(int64_t)n_embd); - ggml_backend_tensor_get_async(backend_h, t_h_pre_norm, embd_pre_norm_out, 0, n_outputs_pre_norm_new*n_embd*sizeof(float)); - n_outputs_pre_norm_prev += n_outputs_pre_norm_new; + const uint32_t n_embd = hparams.n_embd_out(); + float * embd_nextn_out = embd_nextn.data + offset*n_embd; + + GGML_ASSERT((offset + n_rows)*n_embd <= (int64_t) embd_nextn.size); + ggml_backend_tensor_get_async(backend_h, t_h_nextn, embd_nextn_out, 0, n_rows*n_embd*sizeof(float)); + } } // Copy backend sampling output if this ubatch produced any sampling tensors. @@ -1920,11 +1953,11 @@ int llama_context::decode(const llama_batch & batch_inp) { } n_outputs_prev += n_outputs; + n_tokens_prev += ubatch.n_tokens; } while (mctx->next()); // set to total number of outputs in the batch, for use in llama_get_logits_ith n_outputs = n_outputs_all; - n_outputs_pre_norm = n_outputs_pre_norm_prev; // set output mappings if (n_outputs > 0) { @@ -1983,23 +2016,19 @@ int llama_context::decode(const llama_batch & batch_inp) { // output // -uint32_t llama_context::output_reserve(int32_t n_outputs, int32_t n_outputs_pre_norm_req) { +uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto & hparams = model.hparams; const auto & vocab = model.vocab; const int64_t n_outputs_max = std::max(n_outputs, n_seq_max()); - const int64_t n_outputs_pre_norm_max = cparams.embeddings_pre_norm - ? std::max(n_outputs_pre_norm_req < 0 ? n_outputs : n_outputs_pre_norm_req, n_seq_max()) - : 0; const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); - const auto n_embd = hparams.n_embd; const auto n_embd_out = hparams.n_embd_out(); - bool has_logits = true; - bool has_embd = cparams.embeddings; - bool has_embd_pre_norm = cparams.embeddings_pre_norm; + bool has_logits = true; + bool has_embd = cparams.embeddings; + bool has_embd_nextn = cparams.embeddings_nextn; // TODO: hacky enc-dec support if (model.arch == LLM_ARCH_T5) { @@ -2011,9 +2040,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, int32_t n_outputs_pre_ size_t backend_float_count = 0; size_t backend_token_count = 0; - logits.size = has_logits ? n_vocab*n_outputs_max : 0; - embd.size = has_embd ? n_embd_out*n_outputs_max : 0; - embd_pre_norm.size = has_embd_pre_norm ? n_embd*n_outputs_pre_norm_max : 0; + logits.size = has_logits ? n_vocab*n_outputs_max : 0; + embd.size = has_embd ? n_embd_out*n_outputs_max : 0; + embd_nextn.size = has_embd_nextn ? n_embd_out*n_outputs_max : 0; + + if (has_embd_nextn && !cparams.embeddings_nextn_masked) { + // unmasked: nextn row exists for every token in the batch, not just + // those flagged via batch.logits[i] -> size by token count instead. + embd_nextn.size = (size_t) n_embd_out * n_batch; + } // Allocate backend sampling output buffers if there are backend samplers configured. const bool has_sampling = !sampling.samplers.empty(); @@ -2029,7 +2064,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, int32_t n_outputs_pre_ const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; const size_t new_size = - (logits.size + embd.size + embd_pre_norm.size + backend_float_count) * sizeof(float) + + (logits.size + embd.size + embd_nextn.size + backend_float_count) * sizeof(float) + ( backend_token_count) * sizeof(llama_token); // alloc only when more than the current capacity is required @@ -2046,7 +2081,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, int32_t n_outputs_pre_ buf_output = nullptr; logits.data = nullptr; embd.data = nullptr; - embd_pre_norm.data = nullptr; + embd_nextn.data = nullptr; } auto * buft = ggml_backend_cpu_buffer_type(); @@ -2075,8 +2110,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, int32_t n_outputs_pre_ embd = has_embd ? buffer_view{(float *) (base + offset), embd.size} : buffer_view{nullptr, 0}; offset += embd.size * sizeof(float); - embd_pre_norm = has_embd_pre_norm ? buffer_view{(float *) (base + offset), embd_pre_norm.size} : buffer_view{nullptr, 0}; - offset += embd_pre_norm.size * sizeof(float); + embd_nextn = has_embd_nextn ? buffer_view{(float *) (base + offset), embd_nextn.size} : buffer_view{nullptr, 0}; + offset += embd_nextn.size * sizeof(float); if (has_sampling) { sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)}; @@ -2118,7 +2153,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs, int32_t n_outputs_pre_ std::fill(output_ids.begin(), output_ids.end(), -1); this->n_outputs = 0; - this->n_outputs_pre_norm = 0; return n_outputs_max; } @@ -2143,6 +2177,12 @@ void llama_context::output_reorder() { } } + if (embd_nextn.size > 0) { + for (uint64_t k = 0; k < n_embd; k++) { + std::swap(embd_nextn.data[i0*n_embd + k], embd_nextn.data[i1*n_embd + k]); + } + } + if (!sampling.samplers.empty()) { assert(sampling.logits.size > 0); assert(sampling.probs.size > 0); @@ -2317,7 +2357,7 @@ llm_graph_cb llama_context::graph_get_cb() const { // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer; + const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer(); if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { const auto & dev_layer = model.dev_layer(il); @@ -3310,6 +3350,7 @@ llama_context_params llama_context_default_params() { /*.n_batch =*/ 2048, /*.n_ubatch =*/ 512, /*.n_seq_max =*/ 1, + /*.n_outputs_max =*/ 0, /*.n_rs_seq =*/ 0, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, @@ -3340,6 +3381,7 @@ llama_context_params llama_context_default_params() { /*.kv_unified =*/ false, /*.sampler =*/ nullptr, /*.n_sampler =*/ 0, + /*.ctx_other =*/ nullptr, }; return result; @@ -3388,7 +3430,7 @@ llama_context * llama_init_from_model( const bool k_is_turbo = (params.type_k == GGML_TYPE_TURBO2_0 || params.type_k == GGML_TYPE_TURBO3_0 || params.type_k == GGML_TYPE_TURBO4_0); - for (uint32_t il = 0; il < model->hparams.n_layer; ++il) { + for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) { uint32_t head_k = model->hparams.n_embd_head_k(il); // Turbo types zero-pad heads to next multiple of 128 in llama-kv-cache.cpp if (k_is_turbo && head_k % 128 != 0) { @@ -3408,7 +3450,7 @@ llama_context * llama_init_from_model( params.type_v == GGML_TYPE_TURBO3_0 || params.type_v == GGML_TYPE_TURBO4_0); const bool is_mla = model->hparams.is_mla(); - for (uint32_t il = 0; il < model->hparams.n_layer; ++il) { + for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) { uint32_t head_v = model->hparams.n_embd_head_v(il); // Turbo types zero-pad; MLA has no separate V cache (V = view of K) if (v_is_turbo && !is_mla && head_v % 128 != 0) { @@ -3443,12 +3485,11 @@ llama_context * llama_init_from_model( } if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP && - model->hparams.nextn_predict_layers == 0) { + model->hparams.n_layer_nextn == 0) { LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__); return nullptr; } - try { auto * ctx = new llama_context(*model, params); return ctx; @@ -3583,20 +3624,28 @@ float * llama_get_embeddings_seq(llama_context * ctx, llama_seq_id seq_id) { return ctx->get_embeddings_seq(seq_id); } -void llama_set_embeddings_pre_norm(llama_context * ctx, bool value) { - ctx->set_embeddings_pre_norm(value); +void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) { + ctx->set_embeddings_nextn(value, masked); +} + +llama_memory_t llama_get_memory(const struct llama_context * ctx) { + if (!ctx) { + return nullptr; + } + + return ctx->get_memory(); } -float * llama_get_embeddings_pre_norm(llama_context * ctx) { +float * llama_get_embeddings_nextn(llama_context * ctx) { ctx->synchronize(); - return ctx->get_embeddings_pre_norm(); + return ctx->get_embeddings_nextn(); } -float * llama_get_embeddings_pre_norm_ith(llama_context * ctx, int32_t i) { +float * llama_get_embeddings_nextn_ith(llama_context * ctx, int32_t i) { ctx->synchronize(); - return ctx->get_embeddings_pre_norm_ith(i); + return ctx->get_embeddings_nextn_ith(i); } bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) { @@ -3650,7 +3699,7 @@ struct ggml_cgraph * llama_graph_reserve( uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs) { - auto * memory = ctx->get_memory(); + auto memory = ctx->get_memory(); llama_memory_context_ptr mctx; if (memory) { mctx = memory->init_full(); @@ -3690,10 +3739,6 @@ int32_t llama_set_adapter_cvec( // memory // -llama_memory_t llama_get_memory(const struct llama_context * ctx) { - return ctx->get_memory(); -} - void llama_memory_clear(llama_memory_t mem, bool data) { if (!mem) { return; @@ -4004,3 +4049,7 @@ void llama_opt_epoch( llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx) { return ctx->memory_breakdown(); } + +llama_context * llama_get_ctx_other(struct llama_context * ctx) { + return ctx->get_cparams().ctx_other; +} diff --git a/src/llama-context.h b/src/llama-context.h index 0a9e40de1628..6f8f59a22a3e 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -6,6 +6,7 @@ #include "llama-graph.h" #include "llama-adapter.h" #include "llama-impl.h" +#include "llama-memory.h" #include "ggml-cpp.h" #include "ggml-opt.h" @@ -84,8 +85,8 @@ struct llama_context { float * get_embeddings_ith(int32_t i); float * get_embeddings_seq(llama_seq_id seq_id); - float * get_embeddings_pre_norm(); - float * get_embeddings_pre_norm_ith(int32_t i); + float * get_embeddings_nextn(); + float * get_embeddings_nextn_ith(int32_t i); llama_token * get_sampled_tokens() const; llama_token get_sampled_token_ith(int32_t idx); @@ -110,7 +111,7 @@ struct llama_context { void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); void set_embeddings (bool value); - void set_embeddings_pre_norm(bool value); + void set_embeddings_nextn(bool value, bool masked); void set_causal_attn(bool value); void set_warmup(bool value); @@ -218,7 +219,7 @@ struct llama_context { // Make sure enough space is available for outputs. // Returns max number of outputs for which space was reserved. - uint32_t output_reserve(int32_t n_outputs, int32_t n_outputs_pre_norm = -1); + uint32_t output_reserve(int32_t n_outputs); void output_reorder(); @@ -273,7 +274,7 @@ struct llama_context { llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably - std::unique_ptr memory; + llama_memory_ptr memory; // decode output (2-dimensional array: [n_outputs][n_vocab]) buffer_view logits = {nullptr, 0}; @@ -282,11 +283,10 @@ struct llama_context { // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE buffer_view embd = {nullptr, 0}; - // hidden state before the final output norm (2-dimensional array: [n_outputs_pre_norm][n_embd]) - // populated only when cparams.embeddings_pre_norm is enabled and the model graph - // sets llm_graph_result::t_h_pre_norm - buffer_view embd_pre_norm = {nullptr, 0}; - int32_t n_outputs_pre_norm = 0; + // hidden state required by the nextn layers (2-dimensional array: [n_outputs][n_embd]) + // populated only when cparams.embeddings_nextn is enabled and the model graph + // sets llm_graph_result::t_h_nextn + buffer_view embd_nextn = {nullptr, 0}; struct sampling_info { // !samplers.empty() to check if any samplers are active diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 5898a1c38d51..e1b3cb9cfa4b 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -13,6 +13,7 @@ struct llama_cparams { uint32_t n_ubatch; uint32_t n_seq_max; uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback + uint32_t n_outputs_max; // max outputs supported by the context int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing @@ -28,7 +29,8 @@ struct llama_cparams { float yarn_beta_slow; bool embeddings; - bool embeddings_pre_norm; // also extract the hidden state before the final output norm + bool embeddings_nextn; // also extract the hidden state before the final output norm + bool embeddings_nextn_masked; // extract for only rows where batch.logits != 0 bool causal_attn; bool offload_kqv; bool flash_attn; @@ -47,4 +49,6 @@ struct llama_cparams { ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + + llama_context * ctx_other; }; diff --git a/src/llama-ext.h b/src/llama-ext.h index 11f1986676a5..bd74544129b4 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -89,18 +89,16 @@ LLAMA_API ggml_backend_dev_t llama_model_get_device(const struct llama_model * m LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_context * ctx); -// -// pre-norm embeddings (hidden state before the final output norm) -// - -// mirrors: -// LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings); -LLAMA_API void llama_set_embeddings_pre_norm(struct llama_context * ctx, bool value); +// Set whether the context outputs nextn embeddings or not +// If masked == true, output the embeddings only for the tokens with batch.logits != 0 +// If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits +LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked); // mirrors: // LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); -LLAMA_API float * llama_get_embeddings_pre_norm(struct llama_context * ctx); +LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); -// mirrors: // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); -LLAMA_API float * llama_get_embeddings_pre_norm_ith(struct llama_context * ctx, int32_t i); +LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); + +LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index be10cd28a164..b2d7af9b1a3a 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -102,6 +102,39 @@ bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) { return res; } +void llm_graph_input_embd_h::set_input(const llama_ubatch * ubatch) { + const int64_t n_tokens = ubatch->n_tokens; + + if (ubatch->token) { + ggml_backend_tensor_set(tokens, ubatch->token, 0, n_tokens*ggml_element_size(tokens)); + } else { + // note: mtmd embedding input goes through here + GGML_ASSERT(ubatch->embd); + GGML_ASSERT(n_embd == embd->ne[0]); + + ggml_backend_tensor_set(embd, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(h)); + } + + // TODO: extend llama_ubatch to differentiate between token embeddings and hidden states + // for now, we assume that the hidden state is always provided as an embedding + // ref: https://github.com/ggml-org/llama.cpp/pull/23643 + if (ubatch->embd) { + GGML_ASSERT(n_embd == h->ne[0]); + + ggml_backend_tensor_set(h, ubatch->embd, 0, n_tokens*n_embd*ggml_element_size(h)); + } +} + +bool llm_graph_input_embd_h::can_reuse(const llm_graph_params & params) { + bool res = true; + + res &= (!params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens); + res &= (!params.ubatch.embd) || (embd && embd->ne[1] == params.ubatch.n_tokens); + res &= (!params.ubatch.embd) || (h && h->ne[1] == params.ubatch.n_tokens); + + return res; +} + void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; @@ -177,14 +210,6 @@ void llm_graph_input_pos_bucket_kv::set_input(const llama_ubatch * ubatch) { void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) { GGML_ASSERT(out_ids); - if (n_outputs == 0) { - return; - } - - if (out_ids->buffer == nullptr) { - return; - } - const int64_t n_tokens = ubatch->n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(out_ids->buffer)); @@ -367,7 +392,7 @@ static void print_mask(const float * data, int64_t n_tokens, int64_t n_kv, int64 case LLAMA_SWA_TYPE_SYMMETRIC: swa_type_str = "LLAMA_SWA_TYPE_SYMMETRIC"; break; }; - LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swq_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str); + LLAMA_LOG_DEBUG("%s: n_swa : %d, n_kv: %d, swa_type: %s\n", __func__, (int)n_swa, (int)n_kv, swa_type_str); LLAMA_LOG_DEBUG("%s: '0' = can attend, '∞' = masked\n", __func__); LLAMA_LOG_DEBUG("%s: Rows = query tokens, Columns = key/value tokens\n\n", __func__); @@ -508,15 +533,26 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) { } void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) { - mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); - mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); + // base tensors may not be allocated if there are no non-SWA attention layers + if (self_k_idxs && self_k_idxs->buffer) { + mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch); + mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch); + } - mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + // #24294: guard kq mask on its own buffer (SWA-only/empty sub-cache) + if (self_kq_mask && self_kq_mask->buffer) { + mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn); + } - mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); - mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); + // swa tensors may not be allocated if there are no SWA attention layers + if (self_k_idxs_swa && self_k_idxs_swa->buffer) { + mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch); + mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch); + } - mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + if (self_kq_mask_swa && self_kq_mask_swa->buffer) { + mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn); + } if (self_k_rot) { mctx->get_base()->set_input_k_rot(self_k_rot); @@ -542,14 +578,25 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) { bool res = true; - res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; - //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + // base tensors may not be allocated if there are no non-SWA attention layers + if (self_k_idxs && self_k_idxs->buffer) { + res &= self_k_idxs->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + } - res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; - //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + if (self_kq_mask && self_kq_mask->buffer) { + res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); + } + + // swa tensors may not be allocated if there are no SWA attention layers + if (self_k_idxs_swa && self_k_idxs_swa->buffer) { + res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; + //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there + } - res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams); - res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + if (self_kq_mask_swa && self_kq_mask_swa->buffer) { + res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams); + } return res; } @@ -683,7 +730,9 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch); attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch); + } + if (inp_attn->self_kq_mask && inp_attn->self_kq_mask->buffer) { attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn); } @@ -691,7 +740,9 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) { if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch); attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch); + } + if (inp_attn->self_kq_mask_swa && inp_attn->self_kq_mask_swa->buffer) { attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn); } @@ -737,18 +788,18 @@ bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params) if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) { res &= inp_attn->self_k_idxs->ne[0] == params.ubatch.n_tokens; //res &= inp_attn->self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(inp_attn->self_kq_mask, attn_ctx->get_base(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(inp_attn->self_kq_mask, attn_ctx->get_base(), params.ubatch, params.cparams); + // swa tensors may not be allocated if there are no SWA attention layers if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) { res &= inp_attn->self_k_idxs_swa->ne[0] == params.ubatch.n_tokens; //res &= inp_attn->self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there - - res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams); } + res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams); + res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs(); res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs; @@ -856,6 +907,9 @@ void llm_graph_result::set_outputs() { if (t_embd_pooled != nullptr) { ggml_set_output(t_embd_pooled); } + if (t_h_nextn != nullptr) { + ggml_set_output(t_h_nextn); + } for (auto & [seq_id, t] : t_sampled) { if (t != nullptr) { ggml_set_output(t); @@ -929,7 +983,8 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : cparams (params.cparams), ubatch (params.ubatch), n_embd (hparams.n_embd), - n_layer (hparams.n_layer), + n_layer (hparams.n_layer()), + n_layer_nextn (hparams.n_layer_nextn), n_rot (hparams.n_rot()), n_ctx (cparams.n_ctx), n_head (hparams.n_head()), diff --git a/src/llama-graph.h b/src/llama-graph.h index 9e55d0a675e0..06921868644b 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -121,6 +121,23 @@ class llm_graph_input_embd : public llm_graph_input_i { const int64_t n_embd = 0; }; +// similar to llm_graph_input_embd but with an additional hidden state input +class llm_graph_input_embd_h : public llm_graph_input_i { +public: + llm_graph_input_embd_h(int64_t n_embd) : n_embd(n_embd) {} + virtual ~llm_graph_input_embd_h() = default; + + void set_input(const llama_ubatch * ubatch) override; + + bool can_reuse(const llm_graph_params & params) override; + + ggml_tensor * tokens = nullptr; // I32 [n_batch] + ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch] + ggml_tensor * h = nullptr; // F32 [n_embd, n_batch] + + const int64_t n_embd = 0; +}; + class llm_graph_input_pos : public llm_graph_input_i { public: llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} @@ -581,7 +598,8 @@ struct llm_graph_params { ubatch.n_seqs_unq == other.ubatch.n_seqs_unq && ( (!ubatch.token && !other.ubatch.token) || - (!ubatch.embd && !other.ubatch.embd) + (!ubatch.embd && !other.ubatch.embd) || + (ubatch.token && other.ubatch.token && ubatch.embd && other.ubatch.embd) ); // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same @@ -645,7 +663,7 @@ class llm_graph_result { ggml_tensor * get_logits() const { return t_logits; } ggml_tensor * get_embd() const { return t_embd; } ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } - ggml_tensor * get_h_pre_norm() const { return t_h_pre_norm; } + ggml_tensor * get_h_nextn() const { return t_h_nextn; } ggml_cgraph * get_gf() const { return gf; } ggml_context * get_ctx() const { return ctx_compute.get(); } @@ -674,7 +692,7 @@ class llm_graph_result { ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; ggml_tensor * t_embd_pooled = nullptr; - ggml_tensor * t_h_pre_norm = nullptr; // [n_embd, n_outputs] hidden state before final output norm + ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before final output norm std::map t_sampled_logits; std::map t_candidates; @@ -726,6 +744,7 @@ struct llm_graph_context { const int64_t n_embd; const int64_t n_layer; + const int64_t n_layer_nextn; const int64_t n_rot; const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train) const int64_t n_head; diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 2239309c8fb4..08f41f60f4f0 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -7,19 +7,39 @@ void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) { if (dense_first) { - for (uint32_t il = 0; il < n_layer; ++il) { - swa_layers[il] = n_pattern == 0 || (il % n_pattern != 0); + for (uint32_t il = 0; il < n_layer(); ++il) { + is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0); } } else { - for (uint32_t il = 0; il < n_layer; ++il) { - swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); + for (uint32_t il = 0; il < n_layer(); ++il) { + is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); } } + + for (uint32_t il = n_layer(); il < n_layer_all; ++il) { + is_swa_impl[il] = false; + } +} + +void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) { + if (dense_first) { + for (uint32_t il = 0; il < n_layer(); ++il) { + is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0); + } + } else { + for (uint32_t il = 0; il < n_layer(); ++il) { + is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1)); + } + } + + for (uint32_t il = n_layer(); il < n_layer_all; ++il) { + is_recr_impl[il] = false; + } } bool llama_hparams::is_swa_any() const { - for (uint32_t il = 0; il < n_layer; ++il) { - if (swa_layers[il]) { + for (uint32_t il = 0; il < n_layer_all; ++il) { + if (is_swa_impl[il]) { return true; } } @@ -28,7 +48,7 @@ bool llama_hparams::is_swa_any() const { } uint32_t llama_hparams::n_head(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_head_arr[il]; } @@ -36,7 +56,7 @@ uint32_t llama_hparams::n_head(uint32_t il) const { } uint32_t llama_hparams::n_head_kv(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_head_kv_arr[il]; } @@ -44,7 +64,7 @@ uint32_t llama_hparams::n_head_kv(uint32_t il) const { } uint32_t llama_hparams::n_ff(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return n_ff_arr[il]; } @@ -63,7 +83,7 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const { } uint32_t llama_hparams::n_rot(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? n_rot_swa : n_rot_full; } @@ -71,6 +91,10 @@ uint32_t llama_hparams::n_rot(uint32_t il) const { } uint32_t llama_hparams::n_embd_inp() const { + if (n_embd_inp_impl > 0) { + return n_embd_inp_impl; + } + uint32_t n_embd_inp = n_embd; if (n_deepstack_layers > 0) { @@ -85,7 +109,7 @@ uint32_t llama_hparams::n_embd_out() const { } uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full; } @@ -93,7 +117,7 @@ uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { } uint32_t llama_hparams::n_embd_head_v(uint32_t il) const { - if (il < n_layer) { + if (il < n_layer_all) { return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full; } @@ -114,7 +138,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const { bool llama_hparams::is_n_embd_k_gqa_variable() const { const uint32_t val = n_embd_k_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (val != n_embd_k_gqa(il)) { return true; } @@ -125,7 +149,7 @@ bool llama_hparams::is_n_embd_k_gqa_variable() const { bool llama_hparams::is_n_embd_v_gqa_variable() const { const uint32_t val = n_embd_v_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { if (val != n_embd_v_gqa(il)) { return true; } @@ -136,7 +160,7 @@ bool llama_hparams::is_n_embd_v_gqa_variable() const { uint32_t llama_hparams::n_embd_k_gqa_max() const { uint32_t val = n_embd_k_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { val = std::max(val, n_embd_k_gqa(il)); } @@ -145,7 +169,7 @@ uint32_t llama_hparams::n_embd_k_gqa_max() const { uint32_t llama_hparams::n_embd_v_gqa_max() const { uint32_t val = n_embd_v_gqa(); - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer_all; ++il) { val = std::max(val, n_embd_v_gqa(il)); } @@ -193,12 +217,12 @@ uint32_t llama_hparams::n_embd_s() const { return ssm_d_state * ssm_d_inner; } -bool llama_hparams::is_recurrent(uint32_t il) const { - if (il < n_layer) { - return recurrent_layer_arr[il]; +bool llama_hparams::is_recr(uint32_t il) const { + if (il < n_layer_all) { + return is_recr_impl[il]; } - GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer); + GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all); } uint32_t llama_hparams::n_pos_per_embd() const { @@ -206,11 +230,11 @@ uint32_t llama_hparams::n_pos_per_embd() const { } bool llama_hparams::is_swa(uint32_t il) const { - if (il < n_layer) { - return swa_layers[il]; + if (il < n_layer_all) { + return is_swa_impl[il]; } - GGML_ABORT("fatal error"); + GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all); } bool llama_hparams::is_mla() const { @@ -229,12 +253,6 @@ uint32_t llama_hparams::n_embd_head_v_mla() const { } bool llama_hparams::has_kv(uint32_t il) const { - if (kv_only_nextn) { - // MTP head: only the trailing nextn_predict_layers blocks own a KV cache; - // the leading trunk blocks are not executed in this graph. - return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers); - } - if (n_layer_kv_from_start >= 0) { if (il < (uint32_t) n_layer_kv_from_start) { return true; @@ -247,15 +265,17 @@ bool llama_hparams::has_kv(uint32_t il) const { return true; } +uint32_t llama_hparams::n_layer() const { + return n_layer_all - n_layer_nextn; +} + uint32_t llama_hparams::n_layer_kv() const { uint32_t res = 0; - - for (uint32_t il = 0; il < n_layer; ++il) { + for (uint32_t il = 0; il < n_layer(); ++il) { if (has_kv(il)) { res++; } } - return res; } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index e2d051edc6cd..1bb99dcf3738 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -34,6 +34,9 @@ struct llama_hparams_convnext { }; struct llama_hparams { + // note: use the `_impl` suffix to avoid name conflict between members and getters + // for example: n_embd_out() vs n_embd_out_impl + bool vocab_only; bool no_alloc; bool rope_finetuned; @@ -42,12 +45,15 @@ struct llama_hparams { uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; - uint32_t n_layer; - int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache + uint32_t n_layer_all; + uint32_t n_layer_nextn = 0; uint32_t n_expert = 0; uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; + // TODO: this needs to be reworked + int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache + // different head size for full_attention and SWA layers uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head @@ -90,9 +96,6 @@ struct llama_hparams { uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; uint32_t moe_every_n_layers = 0; uint32_t moe_latent_size = 0; - uint32_t nextn_predict_layers = 0; - - bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches) float f_norm_eps; float f_norm_rms_eps; @@ -134,11 +137,15 @@ struct llama_hparams { llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE; // the size of the sliding window (0 - no SWA) uint32_t n_swa = 0; - // if swa_layers[il] == 1, then layer il is SWA - // if swa_layers[il] == 0, then layer il is dense (i.e. non-SWA) + + // if is_swa_impl[il] == 1, then layer il is SWA + // if is_swa_impl[il] == 0, then layer il is dense (i.e. non-SWA) // by default, all layers are dense // note: using uint32_t type for compatibility reason - std::array swa_layers; + std::array is_swa_impl; + + // for hybrid state space models + std::array is_recr_impl; // for State Space Models uint32_t ssm_d_conv = 0; @@ -150,9 +157,6 @@ struct llama_hparams { // for Kimi Linear KDA uint32_t n_embd_head_kda = 0; - // for hybrid state space models - std::array recurrent_layer_arr; - bool ssm_dt_b_c_rms = false; float f_clamp_kqv = 0.0f; @@ -178,6 +182,9 @@ struct llama_hparams { // for Classifiers uint32_t n_cls_out = 1; + // input embedding dimension (0 = use n_embd) + uint32_t n_embd_inp_impl = 0; + // output embedding dimension (0 = use n_embd) uint32_t n_embd_out_impl = 0; @@ -212,8 +219,17 @@ struct llama_hparams { uint32_t indexer_top_k = 0; // qwen3vl deepstack + // When parsed from GGUF, this implies the first N layers consume the first + // N deepstack embeddings. Use deepstack_mapping_arr if you need a more + // complex mapping. If using deepstack_mapping_arr, also make sure to set + // n_deepstack_layers to the number of unique deepstack layers so that + // n_embd_imp is accurate (see granite.cpp). + // TODO: can be expressed via the `new n_embd_inp_impl` and remove this param uint32_t n_deepstack_layers = 0; + // deepstack layer array (Granite4 Vision): -1 => none, >=0 => input embedding index + std::array deepstack_mapping_arr; + // gemma4 per-layer embedding uint32_t n_embd_per_layer = 0; @@ -255,6 +271,13 @@ struct llama_hparams { // return true if one of the layers is SWA bool is_swa_any() const; + bool is_swa(uint32_t il) const; + + void set_recr_pattern(uint32_t n_pattern, bool dense_first = false); + + // whether or not the given layer is recurrent (for hybrid models) + bool is_recr(uint32_t il) const; + uint32_t n_head(uint32_t il = 0) const; uint32_t n_head_kv(uint32_t il = 0) const; @@ -296,13 +319,8 @@ struct llama_hparams { // dimension of the recurrent state embeddings uint32_t n_embd_s() const; - // whether or not the given layer is recurrent (for hybrid models) - bool is_recurrent(uint32_t il) const; - uint32_t n_pos_per_embd() const; - bool is_swa(uint32_t il) const; - // note: currently only support if either all or none of the layers are MLA bool is_mla() const; @@ -311,7 +329,10 @@ struct llama_hparams { bool has_kv(uint32_t il) const; - // number of layers for which has_kv() returns true + // number of effective layers (excludes nextn layers) + uint32_t n_layer() const; + + // number of layers that carry a KV cache (respects n_layer_kv_from_start) uint32_t n_layer_kv() const; // note that this function uses different SWA parameters from those in the hparams diff --git a/src/llama-kv-cache-dsa.cpp b/src/llama-kv-cache-dsa.cpp new file mode 100644 index 000000000000..916ab6537563 --- /dev/null +++ b/src/llama-kv-cache-dsa.cpp @@ -0,0 +1,261 @@ +#include "llama-kv-cache-dsa.h" + +#include "llama-impl.h" +#include "llama-batch.h" +#include "llama-model.h" + +#include +#include + +// +// llama_kv_cache_dsa +// + +llama_kv_cache_dsa::llama_kv_cache_dsa( + const llama_model & model, + ggml_type type_k, + ggml_type type_v, + bool v_trans, + bool offload, + bool unified, + uint32_t kv_size, + uint32_t n_seq_max, + uint32_t n_pad, + uint32_t n_swa, + llama_swa_type swa_type, + const layer_filter_cb & filter, + const layer_reuse_cb & reuse) : + hparams_lid(model.hparams), n_stream(unified ? 1 : n_seq_max) { + + LLAMA_LOG_INFO("%s: creating main KV cache, size = %u cells\n", __func__, kv_size); + + kv_mla = std::make_unique( + model, model.hparams, type_k, type_v, + v_trans, offload, unified, kv_size, n_seq_max, n_pad, + n_swa, swa_type, nullptr, filter, reuse, nullptr); + + // we use llama_kv_cache for caching indexer keys + // by hand-tweaking some hparams we fool it to create + // indexer key cache tensors with correct dimensions + // https://github.com/ggml-org/llama.cpp/pull/21149#discussion_r3015940823 + + // DSA lightning indexer uses MQA with single key head + std::fill(hparams_lid.n_head_kv_arr.begin(), hparams_lid.n_head_kv_arr.end(), 1); + hparams_lid.n_embd_head_k_full = model.hparams.indexer_head_size; + hparams_lid.rope_type = LLAMA_ROPE_TYPE_NEOX; + + LLAMA_LOG_INFO("%s: creating indexer KV cache, size = %u cells\n", __func__, kv_size); + + kv_lid = std::make_unique( + model, hparams_lid, type_k, type_v, + v_trans, offload, unified, kv_size, n_seq_max, n_pad, + n_swa, swa_type, nullptr, filter, reuse, nullptr); +} + +void llama_kv_cache_dsa::clear(bool data) { + kv_mla->clear(data); + kv_lid->clear(data); +} + +bool llama_kv_cache_dsa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + bool res = true; + + res = res & kv_mla->seq_rm(seq_id, p0, p1); + res = res & kv_lid->seq_rm(seq_id, p0, p1); + + return res; +} + +void llama_kv_cache_dsa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + kv_mla->seq_cp(seq_id_src, seq_id_dst, p0, p1); + kv_lid->seq_cp(seq_id_src, seq_id_dst, p0, p1); +} + +void llama_kv_cache_dsa::seq_keep(llama_seq_id seq_id) { + kv_mla->seq_keep(seq_id); + kv_lid->seq_keep(seq_id); +} + +void llama_kv_cache_dsa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + kv_mla->seq_add(seq_id, p0, p1, shift); + kv_lid->seq_add(seq_id, p0, p1, shift); +} + +void llama_kv_cache_dsa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + kv_mla->seq_div(seq_id, p0, p1, d); + kv_lid->seq_div(seq_id, p0, p1, d); +} + +llama_pos llama_kv_cache_dsa::seq_pos_min(llama_seq_id seq_id) const { + return kv_mla->seq_pos_min(seq_id); +} + +llama_pos llama_kv_cache_dsa::seq_pos_max(llama_seq_id seq_id) const { + return kv_mla->seq_pos_max(seq_id); +} + +std::map llama_kv_cache_dsa::memory_breakdown() const { + std::map mb = kv_mla->memory_breakdown(); + for (const auto & buft_size : kv_lid->memory_breakdown()) { + mb[buft_size.first] += buft_size.second; + } + return mb; +} + +llama_memory_context_ptr llama_kv_cache_dsa::init_batch( + llama_batch_allocr & balloc, + uint32_t n_ubatch, + bool embd_all) { + GGML_UNUSED(embd_all); + + do { + balloc.split_reset(); + + std::vector ubatches; + while (true) { + auto ubatch = n_stream == 1 ? balloc.split_simple(n_ubatch) : balloc.split_equal(n_ubatch, true); + + if (ubatch.n_tokens == 0) { + break; + } + + ubatches.push_back(std::move(ubatch)); // NOLINT + } + + if (balloc.get_n_used() < balloc.get_n_tokens()) { + // failed to find a suitable split + break; + } + + auto sinfos_mla = kv_mla->prepare(ubatches); + if (sinfos_mla.empty()) { + break; + } + + auto sinfos_lid = kv_lid->prepare(ubatches); + if (sinfos_lid.empty()) { + break; + } + + assert(sinfos_mla.size() == sinfos_lid.size()); + + return std::make_unique( + this, std::move(sinfos_mla), std::move(sinfos_lid), std::move(ubatches)); + } while (false); + + return std::make_unique(LLAMA_MEMORY_STATUS_FAILED_PREPARE); +} + +llama_memory_context_ptr llama_kv_cache_dsa::init_full() { + return std::make_unique(this); +} + +llama_memory_context_ptr llama_kv_cache_dsa::init_update(llama_context * lctx, bool optimize) { + return std::make_unique(this, lctx, optimize); +} + +bool llama_kv_cache_dsa::get_can_shift() const { + return kv_mla->get_can_shift() && + kv_lid->get_can_shift() && + kv_mla->get_size() == kv_lid->get_size(); +} + +void llama_kv_cache_dsa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { + kv_mla->state_write(io, seq_id, flags); + kv_lid->state_write(io, seq_id, flags); +} + +void llama_kv_cache_dsa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { + kv_mla->state_read(io, seq_id, flags); + kv_lid->state_read(io, seq_id, flags); +} + +llama_kv_cache * llama_kv_cache_dsa::get_mla() const { + return kv_mla.get(); +} + +llama_kv_cache * llama_kv_cache_dsa::get_lid() const { + return kv_lid.get(); +} + +// +// llama_kv_cache_dsa_context +// + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context(llama_memory_status status) : status(status) {} + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv) : + ctx_mla(kv->get_mla()->init_full()), + ctx_lid(kv->get_lid()->init_full()), + status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { +} + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv, + llama_context * lctx, + bool optimize) : + ctx_mla(kv->get_mla()->init_update(lctx, optimize)), + ctx_lid(kv->get_lid()->init_update(lctx, optimize)), + status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { +} + +llama_kv_cache_dsa_context::llama_kv_cache_dsa_context( + llama_kv_cache_dsa * kv, + slot_info_vec_t sinfos_mla, + slot_info_vec_t sinfos_lid, + std::vector ubatches) : + ubatches(std::move(ubatches)), + // note: here we copy the ubatches. not sure if this is ideal + ctx_mla(new llama_kv_cache_context(kv->get_mla(), std::move(sinfos_mla), this->ubatches)), + ctx_lid(new llama_kv_cache_context(kv->get_lid(), std::move(sinfos_lid), this->ubatches)), + status(llama_memory_status_combine(ctx_mla->get_status(), ctx_lid->get_status())) { +} + +llama_kv_cache_dsa_context:: ~llama_kv_cache_dsa_context() = default; + +bool llama_kv_cache_dsa_context::next() { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + ctx_mla->next(); + ctx_lid->next(); + + if (++i_next >= ubatches.size()) { + return false; + } + + return true; +} + +bool llama_kv_cache_dsa_context::apply() { + assert(!llama_memory_status_is_fail(status)); + + bool res = true; + + res = res & ctx_mla->apply(); + res = res & ctx_lid->apply(); + + return res; +} + +llama_memory_status llama_kv_cache_dsa_context::get_status() const { + return status; +} + +const llama_ubatch & llama_kv_cache_dsa_context::get_ubatch() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return ubatches[i_next]; +} + +const llama_kv_cache_context * llama_kv_cache_dsa_context::get_mla() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return static_cast(ctx_mla.get()); +} + +const llama_kv_cache_context * llama_kv_cache_dsa_context::get_lid() const { + assert(status == LLAMA_MEMORY_STATUS_SUCCESS); + + return static_cast(ctx_lid.get()); +} diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp index 26e2cb4270b0..07d7dedb71fe 100644 --- a/src/llama-kv-cache-iswa.cpp +++ b/src/llama-kv-cache-iswa.cpp @@ -23,8 +23,10 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad, + llama_memory_t mem_other, const layer_filter_cb & filter, - const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) { + const layer_reuse_cb & reuse, + const layer_share_cb & share) : hparams(model.hparams), unified(unified) { // chain filters const layer_filter_cb filter_base = [&](int32_t il) { @@ -59,17 +61,27 @@ llama_kv_cache_iswa::llama_kv_cache_iswa( LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base); + llama_memory_t mem_other_base = nullptr; + if (mem_other) { + mem_other_base = static_cast(mem_other)->get_base(); + } + + llama_memory_t mem_other_swa = nullptr; + if (mem_other) { + mem_other_swa = static_cast(mem_other)->get_swa(); + } + kv_base = std::make_unique( - model, type_k, type_v, + model, model.hparams, type_k, type_v, v_trans, offload, unified, size_base, n_seq_max, n_pad, - 0, LLAMA_SWA_TYPE_NONE, filter_base, reuse); + 0, LLAMA_SWA_TYPE_NONE, mem_other_base, filter_base, reuse, share); LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa); kv_swa = std::make_unique( - model, type_k, type_v, + model, model.hparams, type_k, type_v, v_trans, offload, unified, size_swa, n_seq_max, n_pad, - hparams.n_swa, hparams.swa_type, filter_swa, reuse); + hparams.n_swa, hparams.swa_type, mem_other_swa, filter_swa, reuse, share); } void llama_kv_cache_iswa::clear(bool data) { diff --git a/src/llama-kv-cache-iswa.h b/src/llama-kv-cache-iswa.h index 70ab22f0d608..dfafc1ef510b 100644 --- a/src/llama-kv-cache-iswa.h +++ b/src/llama-kv-cache-iswa.h @@ -25,8 +25,10 @@ class llama_kv_cache_iswa : public llama_memory_i { uint32_t n_seq_max, uint32_t n_ubatch, uint32_t n_pad, + llama_memory_t mem_other, const layer_filter_cb & filter, - const layer_reuse_cb & reuse); + const layer_reuse_cb & reuse, + const layer_share_cb & share); ~llama_kv_cache_iswa() = default; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 5a1a1c2e88b8..5f4aa3010a42 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -103,6 +103,7 @@ TURBO_IQ_IMPORT void turbo_innerq_mark_tensor_updated(void); llama_kv_cache::llama_kv_cache( const llama_model & model, + const llama_hparams & hparams, ggml_type type_k, ggml_type type_v, bool v_trans, @@ -113,9 +114,11 @@ llama_kv_cache::llama_kv_cache( uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type, + llama_memory_t mem_other, const layer_filter_cb & filter, - const layer_reuse_cb & reuse) : - model(model), hparams(model.hparams), v_trans(v_trans), + const layer_reuse_cb & reuse, + const layer_share_cb & share) : + model(model), hparams(hparams), v_trans(v_trans), n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) { GGML_ASSERT(kv_size % n_pad == 0); @@ -146,6 +149,10 @@ llama_kv_cache::llama_kv_cache( } } + // #24060/MTP fix: iterate ALL layers (incl. nextn) so an all-nextn draft + // (gemma4-assistant: n_layer()==0) registers its KV layers; has_kv() still + // gates per-layer. Upstream loops the full hparams.n_layer member here. + const uint32_t n_layer = hparams.n_layer_all; const uint32_t n_layer_kv = hparams.n_layer_kv(); // define a comparator for the buft -> ctx map to ensure that the order is well-defined: @@ -210,7 +217,9 @@ llama_kv_cache::llama_kv_cache( const bool is_mla = hparams.is_mla(); - for (uint32_t il = 0; il < hparams.n_layer; il++) { + other = static_cast(mem_other); + + for (uint32_t il = 0; il < n_layer; il++) { if (!hparams.has_kv(il)) { LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il); continue; @@ -221,6 +230,24 @@ llama_kv_cache::llama_kv_cache( continue; } + if (share && other) { + const int32_t il_share = share(il); + + if (il_share >= 0) { + const auto & layer_share = other->layers[other->map_layer_ids[il_share]]; + + LLAMA_LOG_WARN("%s: layer %3d: sharing with layer %d. k = %p, v = %p\n", __func__, il, il_share, + layer_share.k->data, layer_share.v->data); + + map_layer_ids[il] = layers.size(); + + layers.push_back(layer_share); + layers.back().il = il; + + continue; + } + } + if (n_embd_head_k_all == 0) { n_embd_head_k_all = (int32_t) hparams.n_embd_head_k(il); } else if (n_embd_head_k_all > 0 && n_embd_head_k_all != (int32_t) hparams.n_embd_head_k(il)) { @@ -286,7 +313,7 @@ llama_kv_cache::llama_kv_cache( return mode; } // Auto-enable Boundary V (mode 7) when V is turbo2 - if (type_v == GGML_TYPE_TURBO2_0 && hparams.n_layer >= 8) { + if (type_v == GGML_TYPE_TURBO2_0 && hparams.n_layer() >= 8) { LLAMA_LOG_INFO("llama_kv_cache: Boundary V auto-enabled for turbo2-V (opt-out: TURBO_LAYER_ADAPTIVE=0)\n"); return 7; } @@ -294,7 +321,7 @@ llama_kv_cache::llama_kv_cache( }(); const bool is_turbo = (type_k == GGML_TYPE_TURBO3_0 || type_k == GGML_TYPE_TURBO4_0 || type_k == GGML_TYPE_TURBO2_0); const bool v_is_turbo = (type_v == GGML_TYPE_TURBO3_0 || type_v == GGML_TYPE_TURBO4_0 || type_v == GGML_TYPE_TURBO2_0); - const uint32_t n_layer = hparams.n_layer; + const uint32_t n_layer = hparams.n_layer(); if (adaptive_mode == 1 && is_turbo && n_layer >= 8) { if (il < 4 || il >= n_layer - 4) { layer_type_k = GGML_TYPE_Q8_0; @@ -389,7 +416,7 @@ llama_kv_cache::llama_kv_cache( if (reuse) { LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__); - for (uint32_t il = 0; il < hparams.n_layer; il++) { + for (uint32_t il = 0; il < n_layer; il++) { const int32_t il_reuse = reuse(il); if (il_reuse < 0) { @@ -463,61 +490,68 @@ llama_kv_cache::llama_kv_cache( ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); } - // TurboQuant: master's #21038 attention rotation is OFF by default on this - // fork. Enable per-side via LLAMA_ATTN_ROT_K_OVERRIDE=1 and/or - // LLAMA_ATTN_ROT_V_OVERRIDE=1 if your specific model+KV combo benefits. - // - // Why default OFF: empirical PPL+KLD testing on 7 model families - // (gemma-4 26B-A4B/31B/E2B, Qwen2.5-7B, Qwen3.5-2B, Mistral-Small-24B, - // phi-4, on q8/turbo4 KV) showed the optimal rotation policy is highly - // model-and-quant specific: - // - // • gemma-4 31B Q8 q8/turbo4: V-only rotation gives -43% PPL (huge win). - // • gemma-4 26B-A4B Q8 q8/turbo4: V-only gives -3.9%. - // • gemma-4 E2B Q4_K_L q8/turbo4: V-only HURTS by +6.7%. - // • phi-4 Q8 q8/turbo4: V-side rotation crashes (graph hash overflow). - // • Qwen2.5/3.5/Mistral: rotation effect is within standard error. - // - // No single default is correct everywhere, including within the same - // architecture family (gemma-4 above shows three distinct optima across - // three sizes). Per-arch heuristics in code would silently regress users - // on variants we haven't tested. Default OFF + per-side env knobs lets - // each user tune for their specific config; documented findings in the - // README guide the choice. - // - // Reported by @erazortt (TheTom/turboquant_plus#88). - // - // LLAMA_ATTN_ROT_DISABLE retained as a no-op alias (default OFF makes it - // redundant but historical scripts may set it). - // Default attn_rot_disable=false now that rotation is OFF by default. The - // env var is preserved as a hard lock-out (=1 forces rotation off and - // blocks overrides), useful for users who want to guarantee no rotation - // regardless of any LLAMA_ATTN_ROT_*_OVERRIDE settings. - const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE"); - const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? (atoi(LLAMA_ATTN_ROT_DISABLE) != 0) : false; - - // Default: rotation OFF on both sides (safe across all tested model families). - // Override per side via env vars below. - attn_rot_k = false; - attn_rot_v = false; - - // Per-side overrides. Set LLAMA_ATTN_ROT_K_OVERRIDE=1 / LLAMA_ATTN_ROT_V_OVERRIDE=1 - // to enable rotation. The cache type and head-dim alignment guards below - // still apply: rotation only takes effect on quantized types with - // head_dim % 64 == 0 (master's #21038 requirements). - const char * ROT_K_OV = getenv("LLAMA_ATTN_ROT_K_OVERRIDE"); - if (ROT_K_OV && atoi(ROT_K_OV) != 0 && !attn_rot_disable) { - attn_rot_k = - n_embd_head_k_all > 0 && - ggml_is_quantized(type_k) && - hparams.n_embd_head_k() % 64 == 0; - } - const char * ROT_V_OV = getenv("LLAMA_ATTN_ROT_V_OVERRIDE"); - if (ROT_V_OV && atoi(ROT_V_OV) != 0 && !attn_rot_disable) { - attn_rot_v = - n_embd_head_v_all > 0 && - ggml_is_quantized(type_v) && - hparams.n_embd_head_v() % 64 == 0; + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + // KV-cache sharing (MTP draft): a shared cache inherits head dims and the + // resolved rotation policy from its parent so draft and target agree. + if (other) { + n_embd_head_k_all = other->n_embd_head_k_all; + n_embd_head_v_all = other->n_embd_head_v_all; + + attn_rot_k = other->attn_rot_k; + attn_rot_v = other->attn_rot_v; + } else { + // TurboQuant: master's #21038 attention rotation is OFF by default on this + // fork. Enable per-side via LLAMA_ATTN_ROT_K_OVERRIDE=1 and/or + // LLAMA_ATTN_ROT_V_OVERRIDE=1 if your specific model+KV combo benefits. + // + // Why default OFF: empirical PPL+KLD testing on 7 model families + // (gemma-4 26B-A4B/31B/E2B, Qwen2.5-7B, Qwen3.5-2B, Mistral-Small-24B, + // phi-4, on q8/turbo4 KV) showed the optimal rotation policy is highly + // model-and-quant specific: + // + // • gemma-4 31B Q8 q8/turbo4: V-only rotation gives -43% PPL (huge win). + // • gemma-4 26B-A4B Q8 q8/turbo4: V-only gives -3.9%. + // • gemma-4 E2B Q4_K_L q8/turbo4: V-only HURTS by +6.7%. + // • phi-4 Q8 q8/turbo4: V-side rotation crashes (graph hash overflow). + // • Qwen2.5/3.5/Mistral: rotation effect is within standard error. + // + // No single default is correct everywhere, including within the same + // architecture family (gemma-4 above shows three distinct optima across + // three sizes). Per-arch heuristics in code would silently regress users + // on variants we haven't tested. Default OFF + per-side env knobs lets + // each user tune for their specific config; documented findings in the + // README guide the choice. + // + // Reported by @erazortt (TheTom/turboquant_plus#88). + // + // LLAMA_ATTN_ROT_DISABLE retained as a hard lock-out: =1 forces rotation + // off on both sides and blocks the per-side overrides below. + const char * LLAMA_ATTN_ROT_DISABLE = getenv("LLAMA_ATTN_ROT_DISABLE"); + const bool attn_rot_disable = LLAMA_ATTN_ROT_DISABLE ? (atoi(LLAMA_ATTN_ROT_DISABLE) != 0) : false; + + // Default: rotation OFF on both sides (safe across all tested model families). + // Override per side via env vars below. + attn_rot_k = false; + attn_rot_v = false; + + // Per-side overrides. Set LLAMA_ATTN_ROT_K_OVERRIDE=1 / LLAMA_ATTN_ROT_V_OVERRIDE=1 + // to enable rotation. The cache type and head-dim alignment guards below + // still apply: rotation only takes effect on quantized types with + // head_dim % 64 == 0 (master's #21038 requirements). + const char * ROT_K_OV = getenv("LLAMA_ATTN_ROT_K_OVERRIDE"); + if (ROT_K_OV && atoi(ROT_K_OV) != 0 && !attn_rot_disable) { + attn_rot_k = + n_embd_head_k_all > 0 && + ggml_is_quantized(type_k) && + hparams.n_embd_head_k() % 64 == 0; + } + const char * ROT_V_OV = getenv("LLAMA_ATTN_ROT_V_OVERRIDE"); + if (ROT_V_OV && atoi(ROT_V_OV) != 0 && !attn_rot_disable) { + attn_rot_v = + n_embd_head_v_all > 0 && + ggml_is_quantized(type_v) && + hparams.n_embd_head_v() % 64 == 0; + } } LLAMA_LOG_INFO("%s: attn_rot_k = %d, n_embd_head_k_all = %d\n", __func__, attn_rot_k, n_embd_head_k_all); @@ -576,6 +610,11 @@ void llama_kv_cache::clear(bool data) { } bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return true; + } + GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); if (p0 < 0) { @@ -639,6 +678,11 @@ bool llama_kv_cache::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { } void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id_src >= 0 && (size_t) seq_id_src < seq_to_stream.size()); GGML_ASSERT(seq_id_dst >= 0 && (size_t) seq_id_dst < seq_to_stream.size()); @@ -726,6 +770,11 @@ void llama_kv_cache::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, ll } void llama_kv_cache::seq_keep(llama_seq_id seq_id) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -748,6 +797,11 @@ void llama_kv_cache::seq_keep(llama_seq_id seq_id) { } void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_add() is only supported for n_pos_per_embd() == 1"); @@ -793,6 +847,11 @@ void llama_kv_cache::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, ll } void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); GGML_ASSERT(hparams.n_pos_per_embd() == 1 && "seq_div() is only supported for n_pos_per_embd() == 1"); @@ -827,6 +886,11 @@ void llama_kv_cache::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, in } llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return other->seq_pos_min(seq_id); + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); const auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -835,6 +899,11 @@ llama_pos llama_kv_cache::seq_pos_min(llama_seq_id seq_id) const { } llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return other->seq_pos_max(seq_id); + } + GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); const auto & cells = v_cells[seq_to_stream[seq_id]]; @@ -975,6 +1044,11 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vectorget_sched(); @@ -1250,6 +1324,12 @@ llama_kv_cache::slot_info llama_kv_cache::find_slot(const llama_ubatch & ubatch, } void llama_kv_cache::apply_ubatch(const slot_info & sinfo, const llama_ubatch & ubatch) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + v_cells = other->v_cells; + return; + } + // keep track of the max sequence position that we would overwrite with this ubatch // for non-SWA cache, this would be always empty llama_seq_id seq_pos_max_rm[LLAMA_MAX_SEQ]; @@ -2094,6 +2174,9 @@ void llm_graph_input_k_shift::set_input(const llama_ubatch * ubatch) { } ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_context * lctx) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + GGML_ASSERT(!other); + auto * ctx = res->get_ctx(); auto * gf = res->get_gf(); @@ -2139,6 +2222,11 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co } void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_UNUSED(flags); io.write(&n_stream, sizeof(n_stream)); @@ -2192,6 +2280,11 @@ void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id, lla } void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) { + // TODO: refactor [TAG_KV_CACHE_SHARE_CELLS] + if (other) { + return; + } + GGML_UNUSED(flags); GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index d19d9945de40..aa4d466a60e1 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -95,6 +95,7 @@ class llama_kv_cache : public llama_memory_i { llama_kv_cache( const llama_model & model, + const llama_hparams & hparams, ggml_type type_k, ggml_type type_v, bool v_trans, @@ -105,8 +106,10 @@ class llama_kv_cache : public llama_memory_i { uint32_t n_pad, uint32_t n_swa, llama_swa_type swa_type, + llama_memory_t mem_other, const layer_filter_cb & filter, - const layer_reuse_cb & reuse); + const layer_reuse_cb & reuse, + const layer_share_cb & share); ~llama_kv_cache() = default; @@ -269,6 +272,9 @@ class llama_kv_cache : public llama_memory_i { // note: this is not part of the KV state and it's only used to speed-up the find_slot() method std::vector v_heads; + // TODO: temporary until we refactor to be able to share the same cells between 2 kv caches [TAG_KV_CACHE_SHARE_CELLS] + llama_kv_cache * other; + std::vector v_cells; // maps from a sequence id to a stream id diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp index a59561ea54dd..c7d4bcd413e8 100644 --- a/src/llama-memory-hybrid-iswa.cpp +++ b/src/llama-memory-hybrid-iswa.cpp @@ -43,9 +43,11 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa( n_seq_max, n_ubatch, n_pad, + nullptr, filter_attn == nullptr ? - [&](int32_t il) { return !hparams.is_recurrent(il); } + [&](int32_t il) { return !hparams.is_recr(il); } : filter_attn, + nullptr, nullptr )), mem_recr(new llama_memory_recurrent( @@ -57,7 +59,7 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa( n_seq_max, n_rs_seq, filter_recr == nullptr ? - [&](int32_t il) { return hparams.is_recurrent(il); } + [&](int32_t il) { return hparams.is_recr(il); } : filter_recr )) {} @@ -75,9 +77,15 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) - const bool unified = (mem_attn->get_base()->get_n_stream() == 1); - ubatch = balloc.split_equal(n_ubatch, !unified); + if (mem_recr->n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) + const bool unified = (mem_attn->get_base()->get_n_stream() == 1); + ubatch = balloc.split_equal(n_ubatch, !unified); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index 5c2e2afdbef1..fc0444cb7caa 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -33,6 +33,7 @@ llama_memory_hybrid::llama_memory_hybrid( hparams(model.hparams), mem_attn(new llama_kv_cache( model, + model.hparams, type_k, type_v, v_trans, @@ -43,9 +44,11 @@ llama_memory_hybrid::llama_memory_hybrid( n_pad, n_swa, swa_type, + nullptr, filter_attn == nullptr ? - [&](int32_t il) { return !hparams.is_recurrent(il); } + [&](int32_t il) { return !hparams.is_recr(il); } : filter_attn, + nullptr, nullptr )), mem_recr(new llama_memory_recurrent( @@ -57,7 +60,7 @@ llama_memory_hybrid::llama_memory_hybrid( n_seq_max, n_rs_seq, filter_recr == nullptr ? - [&](int32_t il) { return hparams.is_recurrent(il); } + [&](int32_t il) { return hparams.is_recr(il); } : filter_recr )) {} @@ -75,9 +78,15 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) - const bool unified = (mem_attn->get_n_stream() == 1); - ubatch = balloc.split_equal(n_ubatch, !unified); + if (mem_recr->n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // Use non-sequential split when KV cache is unified (needed for hellaswag/winogrande/multiple-choice) + const bool unified = (mem_attn->get_n_stream() == 1); + ubatch = balloc.split_equal(n_ubatch, !unified); + } } if (ubatch.n_tokens == 0) { diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp index aeb866657f24..6a4892fb471d 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -26,7 +26,7 @@ llama_memory_recurrent::llama_memory_recurrent( uint32_t n_seq_max, uint32_t n_rs_seq, const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) { - const int32_t n_layer = hparams.n_layer; + const int32_t n_layer = hparams.n_layer(); head = 0; size = mem_size; @@ -416,9 +416,15 @@ llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & // if all tokens are output, split by sequence ubatch = balloc.split_seq(n_ubatch); } else { - // TODO: non-sequential equal split can be done if using unified KV cache - // for simplicity, we always use sequential equal split for now - ubatch = balloc.split_equal(n_ubatch, true); + if (n_rs_seq > 0) { + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: recurrent state rollback does not support equal splits + ubatch = balloc.split_seq(n_ubatch); + } else { + // TODO: non-sequential equal split can be done if using unified KV cache + // for simplicity, we always use sequential equal split for now + ubatch = balloc.split_equal(n_ubatch, true); + } } if (ubatch.n_tokens == 0) { @@ -857,7 +863,7 @@ void llama_memory_recurrent::state_write_meta(llama_io_write_i & io, const std:: void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { const uint32_t s_trans = 0; - const uint32_t n_layer = hparams.n_layer; + const uint32_t n_layer = hparams.n_layer(); io.write(&s_trans, sizeof(s_trans)); io.write(&n_layer, sizeof(n_layer)); @@ -1041,8 +1047,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell io.read(&s_trans, sizeof(s_trans)); io.read(&n_layer, sizeof(n_layer)); - if (n_layer != hparams.n_layer) { - LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer); + if (n_layer != hparams.n_layer()) { + LLAMA_LOG_ERROR("%s: mismatched layer count (%u instead of %u)\n", __func__, n_layer, hparams.n_layer()); return false; } if (cell_count > size) { diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h index 29c58afc9c27..b13b7b748f5e 100644 --- a/src/llama-memory-recurrent.h +++ b/src/llama-memory-recurrent.h @@ -72,6 +72,7 @@ class llama_memory_recurrent : public llama_memory_i { // number of recurrent-state snapshots per seq for rollback; tensors are widened to (1 + n_rs_seq) groups uint32_t n_rs_seq = 0; + // per-seq rollback index std::vector rs_idx; diff --git a/src/llama-memory.h b/src/llama-memory.h index 1ab412261f38..6bd5ff6339a7 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -23,6 +23,8 @@ struct llama_memory_params { bool swa_full; llama_context_type ctx_type; + + llama_memory_t mem_other; }; enum llama_memory_status { @@ -85,6 +87,8 @@ struct llama_memory_i { // return negative value to indicate that the layer il should not reuse memory using layer_reuse_cb = std::function; + using layer_share_cb = std::function; + virtual ~llama_memory_i() = default; // split the input batch into a set of ubatches and verify that they can fit into the cache diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 54e5974e62c4..374788e2468f 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -148,7 +148,7 @@ namespace GGUFMeta { const enum gguf_type arr_type = gguf_get_arr_type(ctx, k); return ArrayInfo { arr_type, - size_t(gguf_get_arr_n(ctx, k)), + gguf_get_arr_n(ctx, k), arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx, k), }; } @@ -447,7 +447,7 @@ namespace GGUFMeta { } if (n > N_MAX) { - throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", (uint32_t) n, (uint32_t) N_MAX, key.c_str())); + throw std::runtime_error(format("n > N_MAX: %u > %u for key %s", n, (uint32_t) N_MAX, key.c_str())); } if (gguf_get_kv_type(metadata, kid) == GGUF_TYPE_ARRAY) { @@ -504,9 +504,9 @@ namespace GGUFMeta { } // TODO: this is not very clever - figure out something better - template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); + template bool llama_model_loader::get_key_or_arr> (enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); - template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); + template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); llama_model_loader::llama_model_loader( @@ -1054,10 +1054,10 @@ struct ggml_tensor * llama_model_loader::create_tensor( if (it == ctx_map.end()) { // one ggml context per buffer type int max_n_tensors = n_tensors; - max_n_tensors += 1; // duplicated output tensor - max_n_tensors += hparams.n_layer*2; // duplicated rope freq tensors + max_n_tensors += 1; // duplicated output tensor + max_n_tensors += hparams.n_layer()*2; // duplicated rope freq tensors if (files.empty()) { - max_n_tensors += hparams.n_layer*256; // this should be well above what any model actually uses + max_n_tensors += hparams.n_layer()*256; // this should be well above what any model actually uses } const size_t ctx_size = ggml_tensor_overhead()*max_n_tensors; diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 528e4c9c069f..88692babbb6c 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -14,9 +14,6 @@ bool llama_model_saver_supports_arch(llm_arch arch) { switch (arch) { - case LLM_ARCH_QWEN3NEXT: - case LLM_ARCH_QWEN35: - case LLM_ARCH_QWEN35MOE: case LLM_ARCH_PLAMO3: case LLM_ARCH_GEMMA3: case LLM_ARCH_GEMMA3N: @@ -79,7 +76,7 @@ void llama_model_saver::add_kv(const enum llm_kv key, const char value) { template void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) { GGML_ASSERT(model != nullptr || !per_layer); - const size_t n_values = per_layer ? size_t(model->hparams.n_layer) : value.size(); + const size_t n_values = per_layer ? size_t(model->hparams.n_layer()) : value.size(); GGML_ASSERT(n_values <= value.size()); if (n_values == 0) { @@ -106,6 +103,8 @@ void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, c gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values); } else if (std::is_same::value) { gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values); + } else if (std::is_same::value) { + gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_BOOL, value.data(), n_values); } else if (std::is_same::value) { gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values); } else if (std::is_same::value) { @@ -206,7 +205,7 @@ void llama_model_saver::add_kv_from_model() { if (hparams.n_embd_out_impl > 0) { add_kv(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl); } - add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer); + add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer_all); add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true); add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); @@ -227,7 +226,7 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_EXPERT_GROUP_SCALE, hparams.expert_group_scale); add_kv(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); add_kv(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers); - add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers); + add_kv(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn); add_kv(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers); add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type)); add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); @@ -244,7 +243,7 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale); add_kv(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count); add_kv(LLM_KV_INTERLEAVE_MOE_LAYER_STEP, hparams.n_moe_layer_step); - // add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???); + // add_kv(LLM_KV_FULL_ATTENTION_INTERVAL, ???); // saved as LLM_KV_ATTENTION_RECURRENT_LAYERS instead add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true); add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true); @@ -278,6 +277,7 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, hparams.indexer_n_head); add_kv(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, hparams.indexer_head_size); add_kv(LLM_KV_ATTENTION_INDEXER_TOP_K, hparams.indexer_top_k); + add_kv(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, true); const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8bf20a716eba..f519784ccafb 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -134,6 +134,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_gemma3n(params); case LLM_ARCH_GEMMA4: return new llama_model_gemma4(params); + case LLM_ARCH_GEMMA4_ASSISTANT: + return new llama_model_gemma4_assistant(params); case LLM_ARCH_GEMMA_EMBEDDING: return new llama_model_gemma_embedding(params); case LLM_ARCH_STARCODER2: @@ -366,10 +368,10 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str // count only the same type of previous layers to avoid this auto get_il_eff = [&](const size_t il){ size_t ret = 0; - const bool il_is_recurrent = hparams.is_recurrent(il); - const bool il_is_swa = hparams.is_swa(il); + const bool il_is_recr = hparams.is_recr(il); + const bool il_is_swa = hparams.is_swa(il); for (size_t il_prev = 0; il_prev < il; il_prev++) { - ret += hparams.is_recurrent(il_prev) == il_is_recurrent && hparams.is_swa(il_prev) == il_is_swa; + ret += hparams.is_recr(il_prev) == il_is_recr && hparams.is_swa(il_prev) == il_is_swa; } return ret; }; @@ -391,7 +393,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str rotation = get_il_eff(il) % ud->n_devices; } else { il = 0; - rotation = hparams.n_layer % ud->n_devices; + rotation = hparams.n_layer() % ud->n_devices; } const ggml_tensor * tensor_axis_0 = suffix.empty() ? tensor : ud->model->get_tensor((prefix + suffix).c_str()); if (tensor_axis_0 == nullptr) { @@ -546,7 +548,7 @@ struct ggml_backend_meta_split_state llama_meta_device_get_split_state(const str }; auto get_split_granularity = [&](int64_t blck_size, uint32_t il, const std::vector & segments) -> std::vector { - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // linear attention const int64_t head_dim = hparams.ssm_d_state; const int64_t granularity_qkv = std::lcm(blck_size, head_dim); @@ -1000,7 +1002,7 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EMBEDDING_LENGTH_OUT, hparams.n_embd_out_impl, false); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn, false); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false); - ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer); + ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer_all); ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); ml.get_key(LLM_KV_EXPERT_GROUP_COUNT, hparams.n_expert_groups, false); @@ -1042,28 +1044,26 @@ void llama_model_base::load_hparams(llama_model_loader & ml) { std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); - std::fill( - hparams.recurrent_layer_arr.begin(), - hparams.recurrent_layer_arr.end(), - llm_arch_is_recurrent(ml.get_arch())); std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0); - std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0); + std::fill(hparams.is_swa_impl.begin(), hparams.is_swa_impl.end(), 0); + std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), llm_arch_is_recurrent(ml.get_arch()) ? 1 : 0); std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f); std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f); - std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f); - std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f); + std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f); + std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f); + std::fill(hparams.swiglu_clamp_exp.begin(), hparams.swiglu_clamp_exp.end(), 0.0f); std::fill(hparams.swiglu_clamp_shexp.begin(), hparams.swiglu_clamp_shexp.end(), 0.0f); - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer(), false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer(), false); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer(), false); bool rope_finetuned = false; ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); @@ -1162,7 +1162,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { const auto & use_mlock = params.use_mlock; const auto & tensor_split = params.tensor_split; - const int n_layer = hparams.n_layer; + const int n_layer = hparams.n_layer_all; const int n_gpu_layers = this->n_gpu_layers(); const bool use_mmap_buffer = true; @@ -1219,10 +1219,10 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { splits[i] /= split_sum; } - const int i_gpu_start = std::max(int(hparams.n_layer) + 1 - n_gpu_layers, 0); - const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, int(n_layer) + 1); + const int i_gpu_start = std::max(n_layer + 1 - n_gpu_layers, 0); + const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, n_layer + 1); auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev { - const bool is_swa = il < int(hparams.n_layer) && hparams.is_swa(il); + const bool is_swa = il < n_layer && hparams.is_swa(il); if (il < i_gpu_start || (il - i_gpu_start) >= act_gpu_layers) { LLAMA_LOG_DEBUG("load_tensors: layer %3d assigned to device %s, is_swa = %d\n", il, ggml_backend_dev_name(cpu_dev), is_swa); return {cpu_dev, &pimpl->cpu_buft_list}; @@ -1334,6 +1334,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_s && layer.ssm_beta) { layer.ssm_beta_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.nextn.eh_proj_s && layer.nextn.eh_proj) { + layer.nextn.eh_proj_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.nextn.shared_head_head_s && layer.nextn.shared_head_head) { + layer.nextn.shared_head_head_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "scale", i), {1}, TENSOR_NOT_REQUIRED); + } // input scales if (!layer.wq_in_s && layer.wq) { @@ -1393,6 +1399,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { if (!layer.ssm_beta_in_s && layer.ssm_beta) { layer.ssm_beta_in_s = create_tensor(tn(LLM_TENSOR_SSM_BETA, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); } + if (!layer.nextn.eh_proj_in_s && layer.nextn.eh_proj) { + layer.nextn.eh_proj_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } + if (!layer.nextn.shared_head_head_in_s && layer.nextn.shared_head_head) { + layer.nextn.shared_head_head_in_s = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "input_scale", i), {1}, TENSOR_NOT_REQUIRED); + } } // output scales if (output && output->type == GGML_TYPE_NVFP4) { @@ -1513,7 +1525,7 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } if (llama_supports_gpu_offload()) { - const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); + const int n_gpu = std::min(n_gpu_layers, n_layer); int n_repeating = n_gpu; if (n_repeating > 0) { @@ -1522,8 +1534,8 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_repeating); - const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_backend_supported_layers = n_layer + 1; + const int max_offloadable_layers = n_layer + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); } @@ -1592,7 +1604,7 @@ const float * llama_model::tensor_split() const { } uint32_t llama_model::n_gpu_layers() const { - return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1; + return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer() + 1; } llama_split_mode llama_model::split_mode() const { @@ -1661,19 +1673,21 @@ void llama_model::print_info() const { if (!hparams.vocab_only) { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); - LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); LLAMA_LOG_INFO("%s: n_embd_inp = %u\n", __func__, hparams.n_embd_inp()); - LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); - LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_embd = %u\n", __func__, hparams.n_embd); + LLAMA_LOG_INFO("%s: n_embd_out = %u\n", __func__, hparams.n_embd_out()); + LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer()); + LLAMA_LOG_INFO("%s: n_layer_all = %u\n", __func__, hparams.n_layer_all); + LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full); LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any()); LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full); LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full); - LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer_all).c_str()); + LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); @@ -1681,7 +1695,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale); LLAMA_LOG_INFO("%s: f_attn_scale = %.1e\n", __func__, hparams.f_attention_scale); LLAMA_LOG_INFO("%s: f_attn_value_scale = %.4f\n", __func__, hparams.f_attn_value_scale); - LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer).c_str()); + LLAMA_LOG_INFO("%s: n_ff = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_ff(il); }, hparams.n_layer_all).c_str()); LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert); LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used); LLAMA_LOG_INFO("%s: n_expert_groups = %d\n", __func__, hparams.n_expert_groups); @@ -1702,6 +1716,14 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown"); + if (arch == LLM_ARCH_GRANITE && + std::any_of(hparams.deepstack_mapping_arr.begin(), + hparams.deepstack_mapping_arr.end(), + [](const auto & entry) { return entry >= 0; })) { + LLAMA_LOG_INFO("%s: deepstack_mapping_arr = %s\n", __func__, + print_f([&](uint32_t il) { return hparams.deepstack_mapping_arr[il]; }, + hparams.n_layer_all).c_str()); + } // MRoPE (Multi-axis Rotary Position Embedding) sections if (const auto & s = hparams.rope_sections; s[0] || s[1] || s[2] || s[3]) { LLAMA_LOG_INFO("%s: mrope sections = [%d, %d, %d, %d]\n", __func__, s[0], s[1], s[2], s[3]); @@ -1804,7 +1826,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); LLAMA_LOG_INFO("%s: expert_gating_func = %s\n", __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func)); - LLAMA_LOG_INFO("%s: nextn_predict_layers = %d\n", __func__, hparams.nextn_predict_layers); + LLAMA_LOG_INFO("%s: n_layer_nextn = %d\n", __func__, hparams.n_layer_nextn); } if (arch == LLM_ARCH_SMALLTHINKER || arch == LLM_ARCH_LFM2MOE) { @@ -1969,22 +1991,21 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_memory_hybrid::layer_filter_cb filter_attn = nullptr; llama_memory_hybrid::layer_filter_cb filter_recr = nullptr; if (arch == LLM_ARCH_FALCON_H1) { - filter_attn = [&](int32_t) { return true; }; - filter_recr = [&](int32_t) { return true; }; + filter_attn = [&](uint32_t) { return true; }; + filter_recr = [&](uint32_t) { return true; }; } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { - filter_attn = [&](int32_t il) { - return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0; + filter_attn = [&](uint32_t il) { + return !hparams.is_recr(il) && hparams.n_ff(il) == 0; }; - filter_recr = [&](int32_t il) { - return hparams.is_recurrent(il) && hparams.n_ff(il) == 0; + filter_recr = [&](uint32_t il) { + return hparams.is_recr(il) && hparams.n_ff(il) == 0; }; } else if (arch == LLM_ARCH_QWEN35 || arch == LLM_ARCH_QWEN35MOE) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - filter_attn = [&, n_main](int32_t il) { - return (uint32_t)il < n_main && !hparams.is_recurrent(il); + filter_attn = [&](uint32_t il) { + return il < hparams.n_layer() && !hparams.is_recr(il); }; - filter_recr = [&, n_main](int32_t il) { - return (uint32_t)il < n_main && hparams.is_recurrent(il); + filter_recr = [&](uint32_t il) { + return il < hparams.n_layer() && hparams.is_recr(il); }; } @@ -2029,13 +2050,16 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* filter_recr */ std::move(filter_recr)); } } else { - llama_memory_i::layer_reuse_cb reuse = nullptr; llama_kv_cache::layer_filter_cb filter = nullptr; + llama_memory_i::layer_reuse_cb reuse = nullptr; + llama_kv_cache::layer_share_cb share = nullptr; if (arch == LLM_ARCH_GEMMA3N || arch == LLM_ARCH_GEMMA4) { - reuse = [&](int32_t il) { - if (il >= (int32_t) hparams.n_layer_kv_from_start) { - return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1); + reuse = [&](uint32_t il) { + GGML_ASSERT(hparams.n_layer_kv_from_start >= 2); + + if (il >= (uint32_t)hparams.n_layer_kv_from_start) { + return hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1); } return -1; @@ -2043,32 +2067,73 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, } if (mtp_on_hybrid_qwen35) { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; - filter = [n_main](int32_t il) { return (uint32_t)il >= n_main; }; + filter = [&](uint32_t il) { return il >= hparams.n_layer(); }; + } + + if (arch == LLM_ARCH_STEP35 && hparams.n_layer_nextn > 0) { + if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP) { + filter = [&](uint32_t il) { return il >= hparams.n_layer(); }; + } else { + filter = [&](uint32_t il) { return il < hparams.n_layer(); }; + } } if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(hparams.is_swa_any()); - res = new llama_kv_cache_iswa( - *this, - params.type_k, - params.type_v, - !cparams.flash_attn, - cparams.offload_kqv, - params.swa_full, - cparams.kv_unified, - cparams.n_ctx_seq, - cparams.n_seq_max, - cparams.n_ubatch, - 1, - filter, - reuse); + if (arch == LLM_ARCH_GEMMA4_ASSISTANT) { + llama_memory_t mem_other = llama_get_memory(cparams.ctx_other); + + share = [&](int32_t il) { + const llama_model * model_other = llama_get_model(cparams.ctx_other); + + if (hparams.is_swa(il)) { + return llama_model_n_layer(model_other) - 2; + } + + return llama_model_n_layer(model_other) - 1; + }; + + res = new llama_kv_cache_iswa( + *this, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + params.swa_full, + cparams.kv_unified, + cparams.n_ctx_seq, + cparams.n_seq_max, + cparams.n_ubatch, + 1, + mem_other, + filter, + reuse, + share); + } else { + res = new llama_kv_cache_iswa( + *this, + params.type_k, + params.type_v, + !cparams.flash_attn, + cparams.offload_kqv, + params.swa_full, + cparams.kv_unified, + cparams.n_ctx_seq, + cparams.n_seq_max, + cparams.n_ubatch, + 1, + nullptr, + filter, + reuse, + share); + } } else { GGML_ASSERT(!hparams.is_swa_any()); res = new llama_kv_cache( *this, + hparams, params.type_k, params.type_v, !cparams.flash_attn, @@ -2079,7 +2144,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, 1, hparams.n_swa, hparams.swa_type, + nullptr, filter, + nullptr, nullptr); } } @@ -2167,7 +2234,7 @@ int32_t llama_model_n_embd_out(const llama_model * model) { } int32_t llama_model_n_layer(const llama_model * model) { - return model->hparams.n_layer; + return model->hparams.n_layer(); } int32_t llama_model_n_head(const llama_model * model) { @@ -2311,6 +2378,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_GEMMA3: case LLM_ARCH_GEMMA3N: case LLM_ARCH_GEMMA4: + case LLM_ARCH_GEMMA4_ASSISTANT: case LLM_ARCH_GEMMA_EMBEDDING: case LLM_ARCH_STARCODER2: case LLM_ARCH_OPENELM: diff --git a/src/llama-model.h b/src/llama-model.h index 01c87a75271f..2d2b83208540 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -202,12 +202,16 @@ struct llama_layer_shortconv { }; struct llama_layer_nextn { - struct ggml_tensor * eh_proj = nullptr; - struct ggml_tensor * embed_tokens = nullptr; - struct ggml_tensor * enorm = nullptr; - struct ggml_tensor * hnorm = nullptr; - struct ggml_tensor * shared_head_head = nullptr; - struct ggml_tensor * shared_head_norm = nullptr; + struct ggml_tensor * eh_proj = nullptr; + struct ggml_tensor * eh_proj_s = nullptr; + struct ggml_tensor * eh_proj_in_s = nullptr; + struct ggml_tensor * embed_tokens = nullptr; + struct ggml_tensor * enorm = nullptr; + struct ggml_tensor * hnorm = nullptr; + struct ggml_tensor * shared_head_head = nullptr; + struct ggml_tensor * shared_head_head_s = nullptr; + struct ggml_tensor * shared_head_head_in_s = nullptr; + struct ggml_tensor * shared_head_norm = nullptr; }; struct llama_layer { @@ -538,6 +542,10 @@ struct llama_model { struct ggml_tensor * output_s = nullptr; struct ggml_tensor * output_in_s = nullptr; + // NextN/MTP model-level projections + struct ggml_tensor * nextn_proj_pre = nullptr; + struct ggml_tensor * nextn_proj_post = nullptr; + // classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b = nullptr; @@ -690,7 +698,9 @@ const char * llm_type_name(llm_type type); // convenience macro for loading local variables for load_tensors() in llama_model_base // note: cast to int64_t since we will use these for the tensor dimensions #define LLAMA_LOAD_LOCALS \ - const int n_layer = hparams.n_layer; GGML_UNUSED(n_layer); \ + const int n_layer = hparams.n_layer(); GGML_UNUSED(n_layer); \ + const int n_layer_all = hparams.n_layer_all; GGML_UNUSED(n_layer_all); \ + const int n_layer_nextn = hparams.n_layer_nextn; GGML_UNUSED(n_layer_nextn); \ const int64_t n_head = hparams.n_head(); GGML_UNUSED(n_head); \ const int64_t n_head_kv = hparams.n_head_kv(); GGML_UNUSED(n_head_kv); \ const int64_t n_embd = hparams.n_embd; GGML_UNUSED(n_embd); \ diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f86c9cce55e6..6695dcfb23f3 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -849,7 +849,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vectorhparams.n_embd = desc->n_embd; model->hparams.n_embd_head_k_full = desc->n_embd_head_k; model->hparams.n_embd_head_v_full = desc->n_embd_head_v; - model->hparams.n_layer = desc->n_layer; + model->hparams.n_layer_all = desc->n_layer; model->hparams.n_expert = desc->n_expert; for (uint32_t i = 0; i < desc->n_layer; i++) { diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index a7c77ee5d28b..063b214256e7 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -30,7 +30,7 @@ void llama_model_afmoe::load_arch_hparams(llama_model_loader & ml) { hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 56: type = LLM_TYPE_6B; break; case 32: type = LLM_TYPE_26B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp index bec7136521c6..6dfb8905fbe5 100644 --- a/src/models/apertus.cpp +++ b/src/models/apertus.cpp @@ -2,12 +2,13 @@ void llama_model_apertus::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer); - switch (hparams.n_layer) { + ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer()); + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp index d086c4717ff1..9536e7c5d429 100644 --- a/src/models/arcee.cpp +++ b/src/models/arcee.cpp @@ -4,7 +4,7 @@ void llama_model_arcee::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); // Arcee uses the same structure as Llama - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp index 27deadffeb75..09ee0f752f0e 100644 --- a/src/models/arctic.cpp +++ b/src/models/arctic.cpp @@ -4,7 +4,7 @@ void llama_model_arctic::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); if (hparams.n_expert == 128) { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 35: type = LLM_TYPE_10B_128x3_66B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/arwkv7.cpp b/src/models/arwkv7.cpp index 9bd04127b25a..b38b2064785b 100644 --- a/src/models/arwkv7.cpp +++ b/src/models/arwkv7.cpp @@ -10,7 +10,7 @@ void llama_model_arwkv7::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: switch (hparams.n_embd) { case 768: type = LLM_TYPE_190M; break; diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp index 4d26081cd5d4..585f36141741 100644 --- a/src/models/baichuan.cpp +++ b/src/models/baichuan.cpp @@ -2,7 +2,7 @@ void llama_model_baichuan::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/bailingmoe.cpp b/src/models/bailingmoe.cpp index fe1ae10864b8..7faf73c835b6 100644 --- a/src/models/bailingmoe.cpp +++ b/src/models/bailingmoe.cpp @@ -8,7 +8,7 @@ void llama_model_bailingmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_16B; break; case 88: type = LLM_TYPE_290B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp index 2f0d44a62596..5000e9c6db89 100644 --- a/src/models/bailingmoe2.cpp +++ b/src/models/bailingmoe2.cpp @@ -9,17 +9,13 @@ void llama_model_bailingmoe2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 20: type = LLM_TYPE_16B_A1B; break; - case 21: type = LLM_TYPE_16B_A1B; break; case 32: type = LLM_TYPE_100B_A6B; break; - case 33: type = LLM_TYPE_100B_A6B; break; default: type = LLM_TYPE_UNKNOWN; } } @@ -39,9 +35,9 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) { GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2"); GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2"); - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -78,7 +74,7 @@ void llama_model_bailingmoe2::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); @@ -112,8 +108,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph ggml_tensor * inp_out_ids = build_inp_out_ids(); - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -146,7 +141,7 @@ llama_model_bailingmoe2::graph::graph(const llama_model & model, const llm_graph Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/bert.cpp b/src/models/bert.cpp index 3c28f419ccf2..53ce29f23cae 100644 --- a/src/models/bert.cpp +++ b/src/models/bert.cpp @@ -1,9 +1,9 @@ #include "models.h" void llama_model_bert::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 3: type = LLM_TYPE_17M; break; // bge-micro case 6: diff --git a/src/models/bitnet.cpp b/src/models/bitnet.cpp index 7e8125deec40..c83302745806 100644 --- a/src/models/bitnet.cpp +++ b/src/models/bitnet.cpp @@ -3,7 +3,7 @@ void llama_model_bitnet::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/bloom.cpp b/src/models/bloom.cpp index 30b0f3d07d06..609d2ddf998b 100644 --- a/src/models/bloom.cpp +++ b/src/models/bloom.cpp @@ -3,7 +3,7 @@ void llama_model_bloom::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 30: switch (hparams.n_embd) { diff --git a/src/models/chameleon.cpp b/src/models/chameleon.cpp index 4bceaefd63b3..4f45acecf848 100644 --- a/src/models/chameleon.cpp +++ b/src/models/chameleon.cpp @@ -6,7 +6,7 @@ void llama_model_chameleon::load_arch_hparams(llama_model_loader & ml) { hparams.f_norm_eps = 1e-5; // eps for qk-norm, torch default ml.get_key(LLM_KV_SWIN_NORM, hparams.swin_norm, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_34B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp index 6766fa71c159..7ae5b938fde3 100644 --- a/src/models/chatglm.cpp +++ b/src/models/chatglm.cpp @@ -2,7 +2,8 @@ void llama_model_chatglm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: { if (hparams.n_head(0) == 16) { type = LLM_TYPE_1_5B; diff --git a/src/models/codeshell.cpp b/src/models/codeshell.cpp index 274dd3342a7b..de53bb981847 100644 --- a/src/models/codeshell.cpp +++ b/src/models/codeshell.cpp @@ -2,7 +2,8 @@ void llama_model_codeshell::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 42: type = LLM_TYPE_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp index 2e231bb3f935..750f57a394e3 100644 --- a/src/models/cogvlm.cpp +++ b/src/models/cogvlm.cpp @@ -2,7 +2,8 @@ void llama_model_cogvlm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/cohere2.cpp b/src/models/cohere2.cpp index a514cf88fc63..61a5945a1948 100644 --- a/src/models/cohere2.cpp +++ b/src/models/cohere2.cpp @@ -5,6 +5,7 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) { uint32_t swa_period = 4; ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); hparams.set_swa_pattern(swa_period); + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; @@ -12,7 +13,8 @@ void llama_model_cohere2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/command-r.cpp b/src/models/command-r.cpp index adf7fcaa20f8..94a46188bb89 100644 --- a/src/models/command-r.cpp +++ b/src/models/command-r.cpp @@ -3,7 +3,8 @@ void llama_model_command_r::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_35B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp index af71c7753658..4f5ac4d06a45 100644 --- a/src/models/dbrx.cpp +++ b/src/models/dbrx.cpp @@ -1,14 +1,14 @@ #include "models.h" void llama_model_dbrx::load_arch_hparams(llama_model_loader & ml) { -ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); -ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); -switch (hparams.n_layer) { - case 40: type = LLM_TYPE_16x12B; break; - default: type = LLM_TYPE_UNKNOWN; + switch (hparams.n_layer()) { + case 40: type = LLM_TYPE_16x12B; break; + default: type = LLM_TYPE_UNKNOWN; + } } - } void llama_model_dbrx::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; diff --git a/src/models/deci.cpp b/src/models/deci.cpp index 567e35352761..cdfcf29e02f5 100644 --- a/src/models/deci.cpp +++ b/src/models/deci.cpp @@ -2,7 +2,8 @@ void llama_model_deci::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 80: type = LLM_TYPE_70B; break; case 162: type = LLM_TYPE_405B; break; diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index 1fe54adc13e2..a9e8bc514036 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -5,7 +5,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false); // lite variants include DeepSeek-V2-Lite, GigaChat3-10B-A1.8B, Kanana-2-30B-A3B - const bool is_lite = (hparams.n_layer == 27 || hparams.n_layer == 26 || (hparams.n_layer == 48 && n_vocab == 128256)); + const bool is_lite = (hparams.n_layer() == 27 || hparams.n_layer() == 26 || (hparams.n_layer() == 48 && n_vocab == 128256)); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); @@ -23,7 +23,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { // for compatibility with existing DeepSeek V2 and V2.5 GGUFs // that have no expert_gating_func model parameter set - if ((hparams.n_layer == 47 || hparams.n_layer == 48) && n_vocab == 154880) { + if ((hparams.n_layer() == 47 || hparams.n_layer() == 48) && n_vocab == 154880) { // GLM 4.7 Lite hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; } else { @@ -43,7 +43,7 @@ void llama_model_deepseek2::load_arch_hparams(llama_model_loader & ml) { hparams.f_attn_temp_offset = 0.0f; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 27: type = LLM_TYPE_16B; break; case 47: type = LLM_TYPE_30B_A3B; break; case 60: type = LLM_TYPE_236B; break; @@ -191,8 +191,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p ggml_tensor * inp_out_ids = build_inp_out_ids(); - int effective_n_layers = hparams.n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < effective_n_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm @@ -366,7 +365,7 @@ llama_model_deepseek2::graph::graph(const llama_model & model, const llm_graph_p Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); } } - if (il == effective_n_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/deepseek2ocr.cpp b/src/models/deepseek2ocr.cpp index f9e4c98785c1..65d31c31b93e 100644 --- a/src/models/deepseek2ocr.cpp +++ b/src/models/deepseek2ocr.cpp @@ -14,7 +14,7 @@ void llama_model_deepseek2ocr::load_arch_hparams(llama_model_loader & ml) { hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/delta-net-base.cpp b/src/models/delta-net-base.cpp index 2a4e00384e95..a67238383eda 100644 --- a/src/models/delta-net-base.cpp +++ b/src/models/delta-net-base.cpp @@ -447,13 +447,6 @@ std::pair llm_build_delta_net_base::build_delta_ne return build_delta_net_chunking(q, k, v, g, b, s, il); } -bool llm_build_delta_net_base::keep_rs() const { - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - return cparams.n_rs_seq > 0 - && n_seq_tokens > 1 - && (uint32_t) n_seq_tokens <= 1 + cparams.n_rs_seq; -} - ggml_tensor * llm_build_delta_net_base::build_conv_state( llm_graph_input_rs * inp, ggml_tensor * conv_states_all, @@ -461,12 +454,12 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state( int64_t conv_kernel_size, int64_t conv_channels, int il) { - const auto * mctx_cur = inp->mctx; - const auto kv_head = mctx_cur->get_head(); - const uint32_t mem_size = mctx_cur->get_size(); - const int64_t n_seqs = ubatch.n_seqs; - const int64_t n_seq_tokens = ubatch.n_seq_tokens; - const bool keep = keep_rs(); + const auto * mctx_cur = inp->mctx; + + const auto kv_head = mctx_cur->get_head(); + const auto mem_size = mctx_cur->get_size(); + + const int64_t n_seqs = ubatch.n_seqs; ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs); cb(conv_states, "conv_states", il); @@ -480,32 +473,52 @@ ggml_tensor * llm_build_delta_net_base::build_conv_state( ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0); cb(conv_input, "conv_input", il); - if (!keep) { - ggml_tensor * last_conv_states = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, conv_input->nb[1], - conv_input->nb[2], (conv_input->ne[0] - conv_states->ne[0]) * ggml_element_size(conv_input)); - cb(last_conv_states, "last_conv_states", il); + const int64_t row_count = (conv_kernel_size - 1) * conv_channels; + + const size_t row_size = ggml_row_size(conv_states_all->type, row_count); + + if (cparams.n_rs_seq == 0) { + const int64_t s_idx = conv_input->ne[0] - conv_states->ne[0]; + const int64_t s_slot = 0; + + ggml_tensor * conv_state_last = + ggml_view_3d(ctx0, conv_input, + conv_kernel_size - 1, conv_channels, n_seqs, + conv_input->nb[1], conv_input->nb[2], + ggml_row_size(conv_input->type, s_idx)); + cb(conv_state_last, "conv_state_last", il); - ggml_tensor * state_update_target = - ggml_view_2d(ctx0, conv_states_all, (conv_kernel_size - 1) * conv_channels, n_seqs, conv_states_all->nb[1], - kv_head * (conv_kernel_size - 1) * conv_channels * ggml_element_size(conv_states_all)); - cb(state_update_target, "state_update_target", il); + ggml_tensor * conv_state_update = + ggml_view_2d(ctx0, conv_states_all, + row_count, n_seqs, conv_states_all->nb[1], + (s_slot * mem_size + kv_head) * row_size); + cb(conv_state_update, "conv_state_update", il); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update)); } else { - const int64_t row_count = (conv_kernel_size - 1) * conv_channels; - const size_t row_size = row_count * ggml_element_size(conv_states_all); - for (int64_t t = 1; t <= n_seq_tokens; ++t) { - const uint32_t slot = (uint32_t)(n_seq_tokens - t); - ggml_tensor * src = - ggml_view_3d(ctx0, conv_input, conv_kernel_size - 1, conv_channels, n_seqs, - conv_input->nb[1], conv_input->nb[2], - t * ggml_element_size(conv_input)); - ggml_tensor * dst = - ggml_view_2d(ctx0, conv_states_all, row_count, n_seqs, - conv_states_all->nb[1], - ((size_t) slot * mem_size + kv_head) * row_size); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); + // [TAG_RECURRENT_ROLLBACK_SPLITS] + // TODO: this logic incorrectly assumes that the last (n_rs_seq + 1) tokens of a sequence in a batch are + // inside the same ubatch. currently with `split_equal()` this is not correct + + const int64_t K = (int64_t) cparams.n_rs_seq + 1; + + for (int64_t t = 1; t <= K; ++t) { + const int64_t s_idx = std::max(0, conv_input->ne[0] - conv_states->ne[0] - K + t); + const int64_t s_slot = K - t; + + ggml_tensor * conv_state_last = + ggml_view_3d(ctx0, conv_input, + conv_kernel_size - 1, conv_channels, n_seqs, + conv_input->nb[1], conv_input->nb[2], + ggml_row_size(conv_input->type, s_idx)); + + ggml_tensor * conv_state_update = + ggml_view_2d(ctx0, + conv_states_all, row_count, n_seqs, + conv_states_all->nb[1], + (s_slot * mem_size + kv_head) * row_size); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state_last, conv_state_update)); } } @@ -531,7 +544,9 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( const int64_t n_seqs = s->ne[3]; const int64_t n_seq_tokens = q->ne[2]; - if (!keep_rs()) { + const bool keep = cparams.n_rs_seq > 0; + + if (!keep) { auto attn_out = build_delta_net(q, k, v, g, b, s, il); ggml_tensor * output = attn_out.first; ggml_tensor * new_state = attn_out.second; @@ -554,7 +569,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( ggml_tensor * state_3d = ggml_pad(ctx0, state_in_3d, 0, K - 1, 0, 0); ggml_tensor * gdn_out = ggml_gated_delta_net(ctx0, q, k, v, g, b, state_3d); - cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); + if (n_seq_tokens > 1) { + cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_CH, il); + } else { + cb(gdn_out, LLAMA_TENSOR_NAME_FGDN_AR, il); + } const int64_t attn_score_elems = S_v * H_v * n_seq_tokens * n_seqs; const int64_t state_size_per_snap = S_v * S_v * H_v * n_seqs; @@ -576,9 +595,11 @@ ggml_tensor * llm_build_delta_net_base::build_recurrent_attn( ggml_row_size(gdn_out->type, S_v * S_v), ggml_row_size(gdn_out->type, S_v * S_v * H_v), ggml_row_size(gdn_out->type, attn_score_elems + k_i * state_size_per_snap)); + ggml_tensor * dst = ggml_view_2d(ctx0, ssm_states_all, hparams.n_embd_s(), n_seqs, ssm_states_all->nb[1], ((size_t) cache_slot * mem_size + kv_head) * row_size); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, src, dst)); } diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp index 435d27281c69..07d6ab1b7cdb 100644 --- a/src/models/dots1.cpp +++ b/src/models/dots1.cpp @@ -8,7 +8,8 @@ void llama_model_dots1::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_142B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/dream.cpp b/src/models/dream.cpp index 12ac6f1ce88f..abe737c335a4 100644 --- a/src/models/dream.cpp +++ b/src/models/dream.cpp @@ -2,8 +2,9 @@ void llama_model_dream::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // Dream models are primarily 7B with 28 layers - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_7B; break; diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp index 9b39c605e350..895cf690bd2d 100644 --- a/src/models/ernie4-5.cpp +++ b/src/models/ernie4-5.cpp @@ -12,7 +12,7 @@ void llama_model_ernie4_5::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_0_3B; break; case 28: type = LLM_TYPE_21B_A3B; break; case 54: type = LLM_TYPE_300B_A47B; break; diff --git a/src/models/eurobert.cpp b/src/models/eurobert.cpp index ddf13c3028f2..0948d7de656f 100644 --- a/src/models/eurobert.cpp +++ b/src/models/eurobert.cpp @@ -3,7 +3,7 @@ void llama_model_eurobert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - if (hparams.n_layer == 12) { + if (hparams.n_layer() == 12) { type = LLM_TYPE_SMALL; // 0.2B } } diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp index 76d91982fc53..bccf169f8c00 100644 --- a/src/models/exaone-moe.cpp +++ b/src/models/exaone-moe.cpp @@ -20,13 +20,12 @@ void llama_model_exaone_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_30B_A3B; break; - case 48: - case 49: type = LLM_TYPE_235B_A22B; break; + case 48: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } } @@ -50,9 +49,9 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -70,7 +69,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); // dense layers for first n_layer_dense_lead layers or nextn_predict_layers layers at the end - if (i < (int) hparams.n_layer_dense_lead || (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers)) { + if (i < (int) hparams.n_layer_dense_lead || (i >= n_layer)) { layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, flags); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); @@ -95,7 +94,7 @@ void llama_model_exaone_moe::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, flags); @@ -130,8 +129,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ ggml_tensor * inp_out_ids = build_inp_out_ids(); - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // use RoPE for SWA layers @@ -170,7 +168,7 @@ llama_model_exaone_moe::graph::graph(const llama_model & model, const llm_graph_ Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); cb(cur, "attn_out", il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/exaone.cpp b/src/models/exaone.cpp index c7e9960d7185..676fb37b5a63 100644 --- a/src/models/exaone.cpp +++ b/src/models/exaone.cpp @@ -3,7 +3,7 @@ void llama_model_exaone::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/exaone4.cpp b/src/models/exaone4.cpp index 499e22dde81e..595289d72f57 100644 --- a/src/models/exaone4.cpp +++ b/src/models/exaone4.cpp @@ -1,7 +1,7 @@ #include "models.h" void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) { - if (hparams.n_layer == 64) { // 32B + if (hparams.n_layer() == 64) { // 32B hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.n_swa = 4096; uint32_t swa_period = 4; @@ -16,7 +16,7 @@ void llama_model_exaone4::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_1_2B; break; case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp index 94b65a3c7c94..d6ef2d519862 100644 --- a/src/models/falcon-h1.cpp +++ b/src/models/falcon-h1.cpp @@ -11,9 +11,9 @@ void llama_model_falcon_h1::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); - std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true); + std::fill(hparams.is_recr_impl.begin(), hparams.is_recr_impl.end(), true); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_0_5B; break; case 24: diff --git a/src/models/falcon.cpp b/src/models/falcon.cpp index ad546ef2db56..b2ad90b32724 100644 --- a/src/models/falcon.cpp +++ b/src/models/falcon.cpp @@ -3,7 +3,7 @@ void llama_model_falcon::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 60: type = LLM_TYPE_40B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma-embedding.cpp b/src/models/gemma-embedding.cpp index 4e07f5f2bdaf..80ed3b1a4605 100644 --- a/src/models/gemma-embedding.cpp +++ b/src/models/gemma-embedding.cpp @@ -21,7 +21,7 @@ void llama_model_gemma_embedding::load_arch_hparams(llama_model_loader & ml) { GGML_ASSERT((hparams.dense_2_feat_in == 0 || hparams.dense_2_feat_in == hparams.n_embd) && "dense_2_feat_in must be equal to n_embd"); GGML_ASSERT((hparams.dense_3_feat_out == 0 || hparams.dense_3_feat_out == hparams.n_embd) && "dense_3_feat_out must be equal to n_embd"); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_0_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/gemma.cpp b/src/models/gemma.cpp index 1519682fdf62..651cd7e64de0 100644 --- a/src/models/gemma.cpp +++ b/src/models/gemma.cpp @@ -3,7 +3,7 @@ void llama_model_gemma::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_2B; break; case 28: type = LLM_TYPE_7B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma2.cpp b/src/models/gemma2.cpp index ae3f9ffb5301..2fbfb15a94a3 100644 --- a/src/models/gemma2.cpp +++ b/src/models/gemma2.cpp @@ -16,7 +16,7 @@ void llama_model_gemma2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_2B; break; case 42: type = LLM_TYPE_9B; break; case 46: type = LLM_TYPE_27B; break; diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp index 63a2b380e71a..690194529e38 100644 --- a/src/models/gemma3.cpp +++ b/src/models/gemma3.cpp @@ -17,7 +17,7 @@ void llama_model_gemma3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 18: type = LLM_TYPE_270M; break; case 26: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_8B; break; // Rnj-1 diff --git a/src/models/gemma3n.cpp b/src/models/gemma3n.cpp index 6ec3a006081f..83eb8250aa94 100644 --- a/src/models/gemma3n.cpp +++ b/src/models/gemma3n.cpp @@ -6,14 +6,14 @@ void llama_model_gemma3n::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; hparams.set_swa_pattern(swa_period); - hparams.n_layer_kv_from_start = 20; - hparams.f_attention_scale = 1.0f; + hparams.n_layer_kv_from_start = 20; + hparams.f_attention_scale = 1.0f; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_E2B; break; case 35: type = LLM_TYPE_E4B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/gemma4-assistant.cpp b/src/models/gemma4-assistant.cpp new file mode 100644 index 000000000000..5b7a25a5abaf --- /dev/null +++ b/src/models/gemma4-assistant.cpp @@ -0,0 +1,200 @@ +#include "models.h" + +void llama_model_gemma4_assistant::load_arch_hparams(llama_model_loader & ml) { + hparams.n_embd_inp_impl = hparams.n_embd_out(); + + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); + + uint32_t n_kv_shared_layers = 0; + ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); + + hparams.f_attention_scale = 1.0f; + + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn == hparams.n_layer_all && "n_layer_nextn must be == n_layer_impl"); + + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); +} + +void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + if (n_embd_head_k != n_embd_head_v) { + throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k == n_embd_head_v"); + } + if (hparams.n_embd_head_k_swa != hparams.n_embd_head_v_swa) { + throw std::runtime_error("Gemma 4 assistant requires n_embd_head_k_swa == n_embd_head_v_swa"); + } + if (hparams.n_embd_out() == n_embd) { + throw std::runtime_error("Gemma 4 assistant requires embedding_length_out to carry the target hidden size"); + } + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + + const int64_t n_embd_backbone = hparams.n_embd_inp(); + nextn_proj_post = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_POST, "weight"), { n_embd, n_embd_backbone }, 0); + + int rope_freqs_flag = 0; + + for (int i = 0; i < n_layer_nextn; ++i) { + auto & layer = layers[i]; + + const int64_t n_head = hparams.n_head(i); + const int64_t n_embd_head = hparams.n_embd_head_k(i); + const int64_t n_ff = hparams.n_ff(i); + + if (i == 0) { + nextn_proj_pre = create_tensor(tn(LLM_TENSOR_NEXTN_PROJ_PRE, "weight", i), { 2*n_embd_backbone, n_embd }, 0); + } + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head*n_head }, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head*n_head, n_embd }, 0); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head }, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); + + layer.out_scale = create_tensor(tn(LLM_TENSOR_LAYER_OUT_SCALE, "weight", i), { 1u }, 0); + + if (!hparams.is_swa(i)) { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_embd_head/2 }, rope_freqs_flag); + rope_freqs_flag = TENSOR_DUPLICATED; + } + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, 0); + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), { n_embd }, 0); + } +} + +std::unique_ptr llama_model_gemma4_assistant::build_arch_graph(const llm_graph_params & params) const { + return std::make_unique(*this, params); +} + +llama_model_gemma4_assistant::graph::graph(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + const int64_t n_embd_backbone = hparams.n_embd_inp(); + + ggml_tensor * inp_tokens; + ggml_tensor * inp_h; + { + auto inp = std::make_unique(n_embd_backbone); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens); + cb(inp->tokens, "inp_tokens", -1); + ggml_set_input(inp->tokens); + inp_tokens = inp->tokens; + res->t_inp_tokens = inp->tokens; + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd_backbone, ubatch.n_tokens); + cb(inp->embd, "inp_h", -1); + ggml_set_input(inp->embd); + inp_h = inp->embd; + res->t_inp_embd = inp->embd; + + res->add_input(std::move(inp)); + } + + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + + ggml_tensor * x = ggml_get_rows(ctx0, model_other->tok_embd, inp_tokens); + x = ggml_scale(ctx0, x, sqrtf((float) n_embd_backbone)); + cb(x, "inp_embd_target", -1); + + ggml_tensor * xh = ggml_concat(ctx0, x, inp_h, 0); + cb(xh, "inp_xh", -1); + + ggml_tensor * cur = ggml_mul_mat(ctx0, model.nextn_proj_pre, xh); + cb(cur, "pre_proj", -1); + + auto * inp_attn = build_attn_inp_kv_iswa(); + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + ggml_tensor * inpL = cur; + + for (int il = 0; il < n_layer_nextn; ++il) { + const bool is_swa = hparams.is_swa(il); + + const int64_t n_embd_head = hparams.n_embd_head_k(il); + const int64_t n_head = hparams.n_head(il); + + const float freq_base_l = model.get_rope_freq_base(cparams, il); + const float freq_scale_l = model.get_rope_freq_scale(cparams, il); + const int n_rot_l = hparams.n_rot(il); + + ggml_tensor * cur_norm = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur_norm, "attn_norm", il); + + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur_norm); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + ggml_tensor * freq_factors = is_swa ? nullptr : model.layers[il].rope_freqs; + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, freq_factors, n_rot_l, rope_type, n_ctx_orig, + freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur_pos", il); + + cur = build_attn(inp_attn, model.layers[il].wo, nullptr, nullptr, + Qcur, nullptr, nullptr, nullptr, nullptr, nullptr, hparams.f_attention_scale, il); + + if (il == n_layer_nextn - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + cur = build_norm(cur, model.layers[il].attn_post_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "attn_post_norm", il); + + ggml_tensor * attn_out = ggml_add(ctx0, cur, inpL); + cb(attn_out, "attn_out", il); + + cur = build_norm(attn_out, model.layers[il].ffn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, nullptr, nullptr, + model.layers[il].ffn_gate, nullptr, nullptr, + model.layers[il].ffn_down, nullptr, nullptr, + nullptr, + LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = build_norm(cur, model.layers[il].ffn_post_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, attn_out); + + cur = ggml_mul(ctx0, cur, model.layers[il].out_scale); + cb(cur, "out_scaled", il); + + inpL = cur; + } + cur = inpL; + + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + ggml_tensor * logits = build_lora_mm(model.output, cur); + cb(logits, "result_output", -1); + res->t_logits = logits; + + ggml_tensor * h_next = ggml_mul_mat(ctx0, model.nextn_proj_post, cur); + cb(h_next, "h_nextn", -1); + res->t_h_nextn = h_next; + + ggml_build_forward_expand(gf, logits); + ggml_build_forward_expand(gf, h_next); +} diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index c92ffffd23ac..6f7fcd645cbd 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -2,12 +2,12 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); uint32_t n_kv_shared_layers = 0; ml.get_key(LLM_KV_ATTENTION_SHARED_KV_LAYERS, n_kv_shared_layers, false); - hparams.n_layer_kv_from_start = hparams.n_layer - (int32_t)n_kv_shared_layers; + hparams.n_layer_kv_from_start = hparams.n_layer_all - (int32_t)n_kv_shared_layers; hparams.f_attention_scale = 1.0f; // Gemma4 uses self.scaling = 1.0 (no pre-attn scaling) ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); @@ -19,7 +19,7 @@ void llama_model_gemma4::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_26B_A4B; break; case 35: type = LLM_TYPE_E2B; break; case 42: type = LLM_TYPE_E4B; break; @@ -155,12 +155,14 @@ class llm_graph_input_logits_bias : public llm_graph_input_i { } virtual ~llm_graph_input_logits_bias() = default; - void set_input(const llama_ubatch *) override { + void set_input(const llama_ubatch * /*ubatch*/) override { const int64_t n_vocab = arr.size(); ggml_backend_tensor_set(logits_bias, arr.data(), 0, n_vocab*ggml_element_size(logits_bias)); } - // bool can_reuse(const llm_graph_params & params) override; + bool can_reuse(const llm_graph_params & /*params*/) override { + return true; + } ggml_tensor * logits_bias = nullptr; // F32 [n_vocab] @@ -270,7 +272,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para } // TODO @ngxson : strip unused token right after the last KV layer to speed up prompt processing - if (il == n_layer - 1 && inp_out_ids) { + // keep all rows when extracting unmasked nextn embeddings (MTP target needs the hidden state for every token) + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } @@ -370,7 +373,7 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para ggml_tensor * inp_this_layer = ggml_view_2d_slice(ctx0, inp_per_layer, il); // [n_embd_per_layer, n_tokens] // TODO @ngxson : improve this - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { inp_this_layer = ggml_get_rows(ctx0, inp_this_layer, inp_out_ids); } @@ -401,6 +404,17 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para model.output_norm, nullptr, LLM_NORM_RMS, -1); + // Expose the post-output-norm hidden state (the LM-head input feature) so that + // MTP draft contexts can read it via llama_get_embeddings_nextn_ith() as the + // recurrent h input. This matches the reference (transformers/vLLM/SGLang), + // which feeds the drafter the target's post-final-norm hidden state. + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + if (!cparams.embeddings_nextn_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + cb(cur, "result_norm", -1); res->t_embd = cur; diff --git a/src/models/glm-dsa.cpp b/src/models/glm-dsa.cpp index af2b55ef563d..11d91312defc 100644 --- a/src/models/glm-dsa.cpp +++ b/src/models/glm-dsa.cpp @@ -33,13 +33,10 @@ void llama_model_glm_dsa::load_arch_hparams(llama_model_loader & ml) { } // NextN/MTP parameters - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 79: type = LLM_TYPE_744B_A40B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -76,9 +73,9 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers // TODO @ngxson : TENSOR_NOT_REQUIRED was a hack, need to remove it later flags |= TENSOR_SKIP | TENSOR_NOT_REQUIRED; @@ -135,8 +132,8 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) { layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, flags); } - // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + // NextN/MTP tensors (preserved but unused) - conditionally load for last n_layer_nextn + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 27654b8cba3c..3105c56b5306 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -20,16 +20,13 @@ void llama_model_glm4_moe::load_arch_hparams(llama_model_loader & ml) { } // NextN/MTP parameters - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { - case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer) - case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open - case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer) + switch (hparams.n_layer()) { + case 46: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air + case 48: type = LLM_TYPE_102B_A12B; break; // Solar Open + case 92: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 default: type = LLM_TYPE_UNKNOWN; } } @@ -54,9 +51,9 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) { // Load ALL tensors including NextN layer to satisfy total tensor count // but only PROCESS up to last layer (skipping final NextN layer) in forward pass - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -116,7 +113,7 @@ void llama_model_glm4_moe::load_arch_tensors(llama_model_loader &) { } // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); @@ -161,8 +158,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // Pre-attention norm @@ -211,7 +207,7 @@ llama_model_glm4_moe::graph::graph(const llama_model & model, const llm_graph_pa model.layers[il].wo, NULL, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index 7c242fed298e..b4326c5f2107 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -5,13 +5,10 @@ void llama_model_glm4::load_arch_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); // NextN/MTP parameters (GLM-OCR) - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 17: type = LLM_TYPE_1B; break; // GLM-OCR case 40: type = LLM_TYPE_9B; break; case 61: type = LLM_TYPE_32B; break; @@ -32,9 +29,9 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { // skip all tensors in the NextN layers flags |= TENSOR_SKIP; } @@ -55,7 +52,7 @@ void llama_model_glm4::load_arch_tensors(llama_model_loader &) { layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags); // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + if (i >= n_layer) { layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); @@ -100,8 +97,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params // Only process up to last layer (skip final NextN layer) // Final layer tensors are loaded but not processed in forward pass - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // Pre-attention norm @@ -140,7 +136,7 @@ llama_model_glm4::graph::graph(const llama_model & model, const llm_graph_params model.layers[il].wo, NULL, model.layers[il].wo_s, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/gpt2.cpp b/src/models/gpt2.cpp index e2dcc8b1521e..45afbccc1210 100644 --- a/src/models/gpt2.cpp +++ b/src/models/gpt2.cpp @@ -2,7 +2,8 @@ void llama_model_gpt2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_SMALL; break; case 24: type = LLM_TYPE_MEDIUM; break; case 36: type = LLM_TYPE_LARGE; break; diff --git a/src/models/gptneox.cpp b/src/models/gptneox.cpp index 443e35addf28..ed5e8c50da2f 100644 --- a/src/models/gptneox.cpp +++ b/src/models/gptneox.cpp @@ -3,7 +3,8 @@ void llama_model_gptneox::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 6: switch (hparams.n_ff()) { case 512: type = LLM_TYPE_14M; break; diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp index 27f6706ea10f..eb23095aece6 100644 --- a/src/models/granite-hybrid.cpp +++ b/src/models/granite-hybrid.cpp @@ -19,8 +19,8 @@ void llama_model_granite_hybrid::load_arch_hparams(llama_model_loader & ml) { hparams.rope_finetuned = rope_finetuned; // A layer is recurrent IFF the n_head_kv value is set to 0 - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { + hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -71,7 +71,7 @@ void llama_model_granite_hybrid::load_arch_tensors(llama_model_loader &) { // norm layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - if (hparams.is_recurrent(i)) { + if (hparams.is_recr(i)) { // ssm layers layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0); @@ -158,7 +158,7 @@ llama_model_granite_hybrid::graph::graph(const llama_model & model, const llm_gr cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // ssm layer // cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il); } else { diff --git a/src/models/granite-moe.cpp b/src/models/granite-moe.cpp index 0d89bc1f3404..115263c418fa 100644 --- a/src/models/granite-moe.cpp +++ b/src/models/granite-moe.cpp @@ -12,7 +12,7 @@ void llama_model_granite_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); hparams.rope_finetuned = rope_finetuned; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_3B; break; // Add additional layer/vocab/etc checks here for other model sizes diff --git a/src/models/granite.cpp b/src/models/granite.cpp index cda4aa231fa9..7aff942da015 100644 --- a/src/models/granite.cpp +++ b/src/models/granite.cpp @@ -12,7 +12,7 @@ void llama_model_granite::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); hparams.rope_finetuned = rope_finetuned; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_3B; break; // Add additional layer/vocab/etc checks here for other model sizes diff --git a/src/models/grok.cpp b/src/models/grok.cpp index 7c46ec1c0f2d..42f38af67243 100644 --- a/src/models/grok.cpp +++ b/src/models/grok.cpp @@ -26,7 +26,7 @@ void llama_model_grok::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_FAST, hparams.yarn_beta_fast, false); ml.get_key(LLM_KV_ROPE_SCALING_YARN_BETA_SLOW, hparams.yarn_beta_slow, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 64: type = LLM_TYPE_314B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp index 1cab75adc7fe..643a448e59ac 100644 --- a/src/models/grovemoe.cpp +++ b/src/models/grovemoe.cpp @@ -7,7 +7,7 @@ void llama_model_grovemoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERTS_PER_GROUP, hparams.n_group_experts); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp index deb3c9671f33..4d55f5e7f317 100644 --- a/src/models/hunyuan-moe.cpp +++ b/src/models/hunyuan-moe.cpp @@ -5,7 +5,7 @@ void llama_model_hunyuan_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_A13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/internlm2.cpp b/src/models/internlm2.cpp index f9ee37a24b69..f6cfdfb9458f 100644 --- a/src/models/internlm2.cpp +++ b/src/models/internlm2.cpp @@ -2,7 +2,8 @@ void llama_model_internlm2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_20B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jais.cpp b/src/models/jais.cpp index 2ba162605f13..415103ce23a8 100644 --- a/src/models/jais.cpp +++ b/src/models/jais.cpp @@ -4,7 +4,7 @@ void llama_model_jais::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_3B; break; case 40: type = LLM_TYPE_13B; break; /* TODO: add variants */ diff --git a/src/models/jais2.cpp b/src/models/jais2.cpp index 8966131441c3..8610fcc9f82f 100644 --- a/src/models/jais2.cpp +++ b/src/models/jais2.cpp @@ -3,7 +3,7 @@ void llama_model_jais2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; case 68: type = LLM_TYPE_70B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp index 84ea63c31367..dba160b014fd 100644 --- a/src/models/jamba.cpp +++ b/src/models/jamba.cpp @@ -8,11 +8,11 @@ void llama_model_jamba::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { + hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { // TODO: Jamba layers are a bit heterogeneous, so naming this is hard. case 12: // 900M 8x???M case 32: // 51B 16x?B diff --git a/src/models/jina-bert-v2.cpp b/src/models/jina-bert-v2.cpp index 4f8866ece4d2..86ff1c84d1a6 100644 --- a/src/models/jina-bert-v2.cpp +++ b/src/models/jina-bert-v2.cpp @@ -4,7 +4,7 @@ void llama_model_jina_bert_v2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); hparams.f_max_alibi_bias = 8.0f; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 4: type = LLM_TYPE_33M; break; // jina-embeddings-small case 12: type = LLM_TYPE_137M; break; // jina-embeddings-base default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/jina-bert-v3.cpp b/src/models/jina-bert-v3.cpp index e0527529f56c..1c974a6f16cc 100644 --- a/src/models/jina-bert-v3.cpp +++ b/src/models/jina-bert-v3.cpp @@ -3,7 +3,7 @@ void llama_model_jina_bert_v3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_558M; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index ecffb105496b..367f6990d1fb 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -14,8 +14,8 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) { // Mark KDA layers as recurrent using n_head_kv pattern (like Jamba) // Set n_head_kv = 0 for KDA layers (recurrent), n_head_kv = n_head for MLA layers (attention) - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { + hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; // KDA layers are recurrent } // MoE parameters - Kimi uses moe_intermediate_size = 1024 @@ -25,7 +25,7 @@ void llama_model_kimi_linear::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 27: type = LLM_TYPE_48B_A3B; break; // Kimi-Linear-48B-A3B default: type = LLM_TYPE_UNKNOWN; } @@ -53,7 +53,7 @@ void llama_model_kimi_linear::load_arch_tensors(llama_model_loader &) { const int64_t n_embd_head_v_kda = hparams.n_embd_head_kda; const int64_t ssm_d_conv = hparams.ssm_d_conv; - if (hparams.is_recurrent(i)) { + if (hparams.is_recr(i)) { // Conv1d weights: try 4D first, then 3D (quantization may remove trailing 1) // 4D: [d_conv, 1, d_inner, 1], 3D: [d_conv, 1, d_inner] layer.ssm_q_conv = create_tensor(tn(LLM_TENSOR_SSM_CONV1D_Q, "weight", i), {ssm_d_conv, 1, n_embd_head_k_kda * n_head, 1}, TENSOR_NOT_REQUIRED); @@ -285,7 +285,7 @@ llama_model_kimi_linear::graph::graph(const llama_model & model, const llm_graph ggml_build_forward_expand(gf, cur); - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // === KDA Layer (Kimi Delta Attention) with Recurrent State === // Reference: vLLM kda.py const auto * mctx_cur = inp_rs->mctx; diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index 29081344b245..97da8a6abb84 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -5,10 +5,13 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - for (uint32_t il = 0; il < hparams.n_layer; ++il) { - hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0; + + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { + hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0; } - hparams.n_layer_dense_lead = hparams.n_layer; + + hparams.n_layer_dense_lead = hparams.n_layer(); + switch (hparams.n_ff()) { case 4608: type = LLM_TYPE_350M; break; case 6912: type = LLM_TYPE_700M; break; @@ -16,10 +19,11 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { case 10752: type = LLM_TYPE_2_6B; break; default: type = LLM_TYPE_UNKNOWN; } + if (const auto is_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); is_swa && hparams.n_swa > 0) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; - for (uint32_t il = 0; il < hparams.n_layer; ++il) { - hparams.swa_layers[il] = !hparams.recurrent_layer_arr[il]; + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { + hparams.is_swa_impl[il] = !hparams.is_recr_impl[il]; } } } @@ -59,7 +63,7 @@ void llama_model_lfm2::load_arch_tensors(llama_model_loader &) { // for operator_norm layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recr(i)) { layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa); @@ -235,8 +239,8 @@ llama_model_lfm2::graph::graph(const llama_model & model, const llm_graph_ cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "model.layers.{}.operator_norm", il); - cur = hparams.is_recurrent(il) ? build_shortconv_block(cur, inp_hybrid->get_recr(), il) : - build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il); + cur = hparams.is_recr(il) ? build_shortconv_block(cur, inp_hybrid->get_recr(), il) : + build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il); if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); diff --git a/src/models/lfm2moe.cpp b/src/models/lfm2moe.cpp index 12a66c05c7dc..490f5c223ebb 100644 --- a/src/models/lfm2moe.cpp +++ b/src/models/lfm2moe.cpp @@ -9,11 +9,11 @@ void llama_model_lfm2moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - for (uint32_t il = 0; il < hparams.n_layer; ++il) { - hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0; + for (uint32_t il = 0; il < hparams.n_layer(); ++il) { + hparams.is_recr_impl[il] = hparams.n_head_kv(il) == 0; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_8B_A1B; break; case 40: type = LLM_TYPE_24B_A2B; break; default: type = LLM_TYPE_UNKNOWN; @@ -55,7 +55,7 @@ void llama_model_lfm2moe::load_arch_tensors(llama_model_loader &) { // for operator_norm layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recr(i)) { layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa); diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp index 9722dde9f177..2ae893864472 100644 --- a/src/models/llada-moe.cpp +++ b/src/models/llada-moe.cpp @@ -2,11 +2,12 @@ void llama_model_llada_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // diffusion language model uses non-causal attention hparams.causal_attn = false; - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_A1_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/llada.cpp b/src/models/llada.cpp index 58b2c466e176..87d4259f9a74 100644 --- a/src/models/llada.cpp +++ b/src/models/llada.cpp @@ -2,14 +2,16 @@ void llama_model_llada::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + // LLaDA-8B has 32 layers, similar to LLaMA but for diffusion - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; } + // Set non-causal attention for diffusion models hparams.causal_attn = false; } diff --git a/src/models/llama.cpp b/src/models/llama.cpp index cef66d054b0c..c0ec7e0a9adb 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -7,13 +7,13 @@ void llama_model_llama::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); if (hparams.n_expert == 8) { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_8x7B; break; case 56: type = LLM_TYPE_8x22B; break; default: type = LLM_TYPE_UNKNOWN; } } else { - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_1B; break; // Llama 3.2 1B case 22: type = LLM_TYPE_1B; break; case 26: type = LLM_TYPE_3B; break; diff --git a/src/models/llama4.cpp b/src/models/llama4.cpp index 0ff5376d571f..7194c72a5850 100644 --- a/src/models/llama4.cpp +++ b/src/models/llama4.cpp @@ -8,14 +8,15 @@ void llama_model_llama4::load_arch_hparams(llama_model_loader & ml) { const bool found_swa = ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); if (found_swa && hparams.n_swa == 0) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; - hparams.n_no_rope_layer_step = hparams.n_layer; // always use rope + hparams.n_no_rope_layer_step = hparams.n_layer(); // always use rope } else { hparams.swa_type = LLAMA_SWA_TYPE_CHUNKED; hparams.n_swa = 8192; hparams.n_attn_temp_floor_scale = 8192; hparams.f_attn_temp_scale = 0.1f; hparams.f_attn_temp_offset = 1.0f; - uint32_t swa_period = 4; // pattern: 3 chunked - 1 full + + uint32_t swa_period = 4; // pattern: 3 chunked - 1 full ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); hparams.set_swa_pattern(swa_period); diff --git a/src/models/maincoder.cpp b/src/models/maincoder.cpp index 84cfe399027e..ae56a26a1f64 100644 --- a/src/models/maincoder.cpp +++ b/src/models/maincoder.cpp @@ -2,7 +2,8 @@ void llama_model_maincoder::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/mamba.cpp b/src/models/mamba.cpp index 887a1fa509a9..0d94e98281ce 100644 --- a/src/models/mamba.cpp +++ b/src/models/mamba.cpp @@ -9,7 +9,7 @@ void llama_model_mamba::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: switch (hparams.n_embd) { case 768: type = LLM_TYPE_SMALL; break; diff --git a/src/models/mamba2.cpp b/src/models/mamba2.cpp index 3277ca53ec4b..c5951cf0f7fc 100644 --- a/src/models/mamba2.cpp +++ b/src/models/mamba2.cpp @@ -9,7 +9,7 @@ void llama_model_mamba2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: switch (hparams.n_embd) { case 768: type = LLM_TYPE_SMALL; break; diff --git a/src/models/mimo2.cpp b/src/models/mimo2.cpp index d0295ec116f6..889891605701 100644 --- a/src/models/mimo2.cpp +++ b/src/models/mimo2.cpp @@ -8,18 +8,18 @@ void llama_model_mimo2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); + + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); float value_scale = 0.0f; if (ml.get_key(LLM_KV_ATTENTION_VALUE_SCALE, value_scale, false) && value_scale != 1.0f) { hparams.f_attn_value_scale = value_scale; } - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_310B_A15B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -34,16 +34,14 @@ void llama_model_mimo2::load_arch_tensors(llama_model_loader &) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - const uint32_t n_nextn = hparams.nextn_predict_layers; - - for (int i = 0; i < n_layer; ++i) { + for (int i = 0; i < n_layer_all; ++i) { auto & layer = layers[i]; uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); uint32_t n_head = hparams.n_head(i); // NextN/MTP layers (the last n_nextn blocks) are preserved but disabled pending support - const bool is_nextn = (n_nextn > 0) && (static_cast(i) >= n_layer - n_nextn); + const bool is_nextn = i >= n_layer; const int skip = is_nextn ? TENSOR_SKIP : 0; create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_k_gqa, n_embd_v_gqa, skip); @@ -92,10 +90,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param const float v_scale = hparams.f_attn_value_scale; - // The last hparams.nextn_predict_layers blocks are MTP heads, currently inactive - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; uint32_t n_head_l = hparams.n_head(il); @@ -173,7 +168,7 @@ llama_model_mimo2::graph::graph(const llama_model & model, const llm_graph_param } } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/minicpm.cpp b/src/models/minicpm.cpp index 966d3af615c0..fc3e5b171d52 100644 --- a/src/models/minicpm.cpp +++ b/src/models/minicpm.cpp @@ -3,7 +3,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) { // Backward-compatible defaults for older MiniCPM GGUFs hparams.f_embedding_scale = 12.0f; - hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer)); + hparams.f_residual_scale = 1.4f / sqrtf(float(hparams.n_layer())); hparams.f_logit_scale = hparams.n_embd ? (256.0f / float(hparams.n_embd)) : 1.0f; ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -16,7 +16,7 @@ void llama_model_minicpm::load_arch_hparams(llama_model_loader & ml) { // MiniCPM uses rope by default, unlike Granite which uses it as a switch hparams.rope_finetuned = true; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 52: type = LLM_TYPE_1B; break; case 40: type = LLM_TYPE_2B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp index 1ffc54fa7c66..e011b1ff0a84 100644 --- a/src/models/minicpm3.cpp +++ b/src/models/minicpm3.cpp @@ -5,7 +5,7 @@ void llama_model_minicpm3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp index 22e291d73a33..b25435e4d97c 100644 --- a/src/models/minimax-m2.cpp +++ b/src/models/minimax-m2.cpp @@ -5,7 +5,7 @@ void llama_model_minimax_m2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 62: type = LLM_TYPE_230B_A10B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp index 4e6ebef82cbb..0b119cc93cce 100644 --- a/src/models/mistral3.cpp +++ b/src/models/mistral3.cpp @@ -18,7 +18,7 @@ void llama_model_mistral3::load_arch_hparams(llama_model_loader & ml) { } } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_3B; break; case 34: type = LLM_TYPE_8B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/models.h b/src/models/models.h index 4e40536a5ea3..bc20a82bc0e2 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -66,9 +66,6 @@ struct llm_build_delta_net_base : public llm_graph_context { ggml_tensor * s, int il); - // true when speculative rollback is enabled and the batch fits in the rs cache - bool keep_rs() const; - // read conv state from cache, concat with qkv_mixed, write back (single slot or per-token) // qkv_mixed: (qkv_dim, n_seq_tokens, n_seqs); returns conv_input: (kernel_size + n_seq_tokens - 1, channels, n_seqs) ggml_tensor * build_conv_state( @@ -800,6 +797,19 @@ struct llama_model_gemma4 : public llama_model_base { }; +struct llama_model_gemma4_assistant : public llama_model_base { + llama_model_gemma4_assistant(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_gemma_embedding : public llama_model_base { llama_model_gemma_embedding(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp index e9b79ffc6dc0..cca841f5c74e 100644 --- a/src/models/modern-bert.cpp +++ b/src/models/modern-bert.cpp @@ -14,7 +14,7 @@ void llama_model_modern_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: type = LLM_TYPE_47M; break; // granite-embedding-small case 22: diff --git a/src/models/mpt.cpp b/src/models/mpt.cpp index 0229d20ed369..d094fd9f80bc 100644 --- a/src/models/mpt.cpp +++ b/src/models/mpt.cpp @@ -5,7 +5,7 @@ void llama_model_mpt::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 48: type = LLM_TYPE_30B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index a82f9c170b48..a456269347b9 100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -9,8 +9,8 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) { // A layer is recurrent IFF the n_head_kv value is set to 0 and // the n_ff value is set to 0 - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0); + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { + hparams.is_recr_impl[i] = (hparams.n_head_kv(i) == 0 && hparams.n_ff(i) == 0); } ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -22,7 +22,7 @@ void llama_model_nemotron_h::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); ml.get_key(LLM_KV_MOE_LATENT_SIZE, hparams.moe_latent_size, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 52: type = LLM_TYPE_31B_A3_5B; break; // Nemotron-H_MOE 31B case 56: type = LLM_TYPE_9B; break; case 88: type = LLM_TYPE_120B_A12B; break; @@ -62,7 +62,7 @@ void llama_model_nemotron_h::load_arch_tensors(llama_model_loader &) { // all blocks use the attn norm layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - if (hparams.is_recurrent(i)) { + if (hparams.is_recr(i)) { // ssm layers layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, d_in_proj}, 0); @@ -143,7 +143,7 @@ llama_model_nemotron_h::graph::graph(const llama_model & model, const llm_graph_ cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // ssm layer // cur = build_mamba2_layer(inp->get_recr(), cur, model, ubatch, il); } else if (hparams.n_ff(il) == 0) { diff --git a/src/models/nemotron.cpp b/src/models/nemotron.cpp index 5d4a3b5c69e5..6e2bd9a33ca8 100644 --- a/src/models/nemotron.cpp +++ b/src/models/nemotron.cpp @@ -2,7 +2,8 @@ void llama_model_nemotron::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_4B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/neo-bert.cpp b/src/models/neo-bert.cpp index f00d6eddfc99..4a08d7abd40f 100644 --- a/src/models/neo-bert.cpp +++ b/src/models/neo-bert.cpp @@ -3,7 +3,7 @@ void llama_model_neo_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - if (hparams.n_layer == 28) { + if (hparams.n_layer() == 28) { type = LLM_TYPE_250M; } } diff --git a/src/models/nomic-bert-moe.cpp b/src/models/nomic-bert-moe.cpp index a17abe2c2692..da4b62919bb9 100644 --- a/src/models/nomic-bert-moe.cpp +++ b/src/models/nomic-bert-moe.cpp @@ -4,7 +4,7 @@ void llama_model_nomic_bert_moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); - if (hparams.n_layer == 12 && hparams.n_embd == 768) { + if (hparams.n_layer() == 12 && hparams.n_embd == 768) { if (arch == LLM_ARCH_NOMIC_BERT) { type = LLM_TYPE_137M; } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { diff --git a/src/models/nomic-bert.cpp b/src/models/nomic-bert.cpp index 5a8a5584457e..e7fc72286a6d 100644 --- a/src/models/nomic-bert.cpp +++ b/src/models/nomic-bert.cpp @@ -4,7 +4,7 @@ void llama_model_nomic_bert::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); - if (hparams.n_layer == 12 && hparams.n_embd == 768) { + if (hparams.n_layer() == 12 && hparams.n_embd == 768) { if (arch == LLM_ARCH_NOMIC_BERT) { type = LLM_TYPE_137M; } else if (arch == LLM_ARCH_NOMIC_BERT_MOE && hparams.moe_every_n_layers == 2) { diff --git a/src/models/olmo.cpp b/src/models/olmo.cpp index cfcf17bcb035..9f7a2ba60efb 100644 --- a/src/models/olmo.cpp +++ b/src/models/olmo.cpp @@ -4,7 +4,7 @@ void llama_model_olmo::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 22: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; case 80: type = LLM_TYPE_70B; break; diff --git a/src/models/olmo2.cpp b/src/models/olmo2.cpp index 7cc262f55046..cb52cdef7204 100644 --- a/src/models/olmo2.cpp +++ b/src/models/olmo2.cpp @@ -17,7 +17,7 @@ void llama_model_olmo2::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp index 7976ae44a51c..1e2baeb207ff 100644 --- a/src/models/olmoe.cpp +++ b/src/models/olmoe.cpp @@ -2,7 +2,8 @@ void llama_model_olmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_A1_7B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/openai-moe.cpp b/src/models/openai-moe.cpp index 15b6c8c12057..3ab15d61f08c 100644 --- a/src/models/openai-moe.cpp +++ b/src/models/openai-moe.cpp @@ -14,7 +14,7 @@ void llama_model_openai_moe::load_arch_hparams(llama_model_loader & ml) { hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_20B; break; case 36: type = LLM_TYPE_120B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp index 9f76350fd4d1..13120bd3236d 100644 --- a/src/models/openelm.cpp +++ b/src/models/openelm.cpp @@ -3,12 +3,12 @@ void llama_model_openelm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { - case 16: type = LLM_TYPE_270M; break; - case 20: type = LLM_TYPE_450M; break; - case 28: type = LLM_TYPE_1B; break; - case 36: type = LLM_TYPE_3B; break; - default: type = LLM_TYPE_UNKNOWN; + switch (hparams.n_layer()) { + case 16: type = LLM_TYPE_270M; break; + case 20: type = LLM_TYPE_450M; break; + case 28: type = LLM_TYPE_1B; break; + case 36: type = LLM_TYPE_3B; break; + default: type = LLM_TYPE_UNKNOWN; } } diff --git a/src/models/orion.cpp b/src/models/orion.cpp index bcb4bbba4b10..863a2822269b 100644 --- a/src/models/orion.cpp +++ b/src/models/orion.cpp @@ -3,7 +3,7 @@ void llama_model_orion::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_14B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/pangu-embed.cpp b/src/models/pangu-embed.cpp index 7593f879b24f..90f05c088c10 100644 --- a/src/models/pangu-embed.cpp +++ b/src/models/pangu-embed.cpp @@ -2,7 +2,8 @@ void llama_model_pangu_embed::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 26: type = LLM_TYPE_1B; break; // openPangu-Embedded-1B-V1.1 case 34: type = LLM_TYPE_7B; break; // openPangu-Embedded-7B-V1.1 default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/phi2.cpp b/src/models/phi2.cpp index 8f3ed5f7b7d2..81b1ad12cc02 100644 --- a/src/models/phi2.cpp +++ b/src/models/phi2.cpp @@ -3,7 +3,7 @@ void llama_model_phi2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp index f8a4a4d5aa50..716ff814cc12 100644 --- a/src/models/phi3.cpp +++ b/src/models/phi3.cpp @@ -3,7 +3,7 @@ void llama_model_phi3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/phimoe.cpp b/src/models/phimoe.cpp index 4575d6139cf5..c332553bc7d1 100644 --- a/src/models/phimoe.cpp +++ b/src/models/phimoe.cpp @@ -3,7 +3,7 @@ void llama_model_phimoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_16x3_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plamo.cpp b/src/models/plamo.cpp index c7ed1211c311..246144519e43 100644 --- a/src/models/plamo.cpp +++ b/src/models/plamo.cpp @@ -3,7 +3,7 @@ void llama_model_plamo::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp index b713889fe72f..b93cf48bc5c6 100644 --- a/src/models/plamo2.cpp +++ b/src/models/plamo2.cpp @@ -11,11 +11,11 @@ void llama_model_plamo2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0; + for (uint32_t i = 0; i < hparams.n_layer(); ++i) { + hparams.is_recr_impl[i] = hparams.n_head_kv(i) == 0; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 16: type = LLM_TYPE_1B; break; case 32: if (hparams.n_embd == 2048) { @@ -54,7 +54,7 @@ void llama_model_plamo2::load_arch_tensors(llama_model_loader &) { for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; - bool is_mamba_layer = hparams.is_recurrent(i); + bool is_mamba_layer = hparams.is_recr(i); layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); @@ -128,7 +128,7 @@ llama_model_plamo2::graph::graph(const llama_model & model, const llm_graph_para cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); // check if this layer is Mamba or Attention - const bool is_mamba_layer = hparams.is_recurrent(il); + const bool is_mamba_layer = hparams.is_recr(il); if (is_mamba_layer) { // PLaMo-2 Mamba layer diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp index 29f3e803d685..16d0b1dcef71 100644 --- a/src/models/plamo3.cpp +++ b/src/models/plamo3.cpp @@ -13,7 +13,7 @@ void llama_model_plamo3::load_arch_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_NONE; } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_2B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/plm.cpp b/src/models/plm.cpp index ce050919e6a8..8ca325f5e2cf 100644 --- a/src/models/plm.cpp +++ b/src/models/plm.cpp @@ -3,7 +3,8 @@ void llama_model_plm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1_8B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/qwen.cpp b/src/models/qwen.cpp index 00467dbad7d5..1f5dff3843cf 100644 --- a/src/models/qwen.cpp +++ b/src/models/qwen.cpp @@ -3,7 +3,7 @@ void llama_model_qwen::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp index a5147460bae6..e9c2ea80a6be 100644 --- a/src/models/qwen2.cpp +++ b/src/models/qwen2.cpp @@ -2,7 +2,8 @@ void llama_model_qwen2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break; case 28: type = hparams.n_embd == 1536 ? LLM_TYPE_1_5B : LLM_TYPE_7B; break; case 32: type = LLM_TYPE_7B; break; diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp index 7cb03859debb..e831ed11aad6 100644 --- a/src/models/qwen2moe.cpp +++ b/src/models/qwen2moe.cpp @@ -5,7 +5,8 @@ void llama_model_qwen2moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_A2_7B; break; case 28: type = LLM_TYPE_57B_A14B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index 41b97fed9564..1d0d2fab362a 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -2,7 +2,8 @@ void llama_model_qwen3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; case 40: type = LLM_TYPE_14B; break; diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 54b5fb186907..4b642cff467c 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -13,21 +13,20 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. - { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_8B : LLM_TYPE_2B; break; case 32: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_9B; break; case 64: type = LLM_TYPE_27B; break; @@ -38,9 +37,7 @@ void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - const bool mtp_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); @@ -69,7 +66,7 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); - if (!hparams.is_recurrent(il)) { + if (!hparams.is_recr(il)) { // Attention layers create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); @@ -121,10 +118,10 @@ void llama_model_qwen35::load_arch_tensors(llama_model_loader & ml) { layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); }; - for (int i = 0; i < (int) n_main; ++i) { + for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } - for (int i = (int) n_main; i < n_layer; ++i) { + for (int i = n_layer; i < n_layer_all; ++i) { load_block_mtp(i); } } @@ -155,11 +152,10 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para auto * inp = build_inp_mem_hybrid(); ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = (n_outputs > 0 && (!cparams.embeddings_pre_norm || n_outputs < n_tokens)) ? build_inp_out_ids() : nullptr; + ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. - const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -168,7 +164,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para ggml_build_forward_expand(gf, cur); // Determine layer type and build appropriate attention mechanism - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // Linear attention layer (gated delta net) cur = build_layer_attn_linear(inp->get_recr(), cur, il); } else { @@ -176,7 +172,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && !cparams.embeddings_pre_norm) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -208,22 +204,15 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para } cur = inpL; - cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); - if (n_outputs == 0) { - ggml_build_forward_expand(gf, cur); - return; - } + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; - if (inp_out_ids && cparams.embeddings_pre_norm && n_outputs < n_tokens) { + if (!cparams.embeddings_nextn_masked && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cb(cur, "h_pre_norm_out", -1); } - // Final norm - cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); res->t_embd = cur; @@ -496,14 +485,15 @@ ggml_tensor * llama_model_qwen35::graph::build_layer_ffn(ggml_tensor * cur, cons // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 dense series llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35 MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35 MTP currently only supports a single MTP block"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35 MTP requires n_layer_nextn > 0"); + GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35 MTP currently only supports a single MTP block"); const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - // The MTP block lives at the source file's original layer index. - const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + // hparams.n_layer includes both main model layers and MTP layers. The MTP + // layer is stored immediately after the main layers in model.layers[]. + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); @@ -513,28 +503,41 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); - auto inp = std::make_unique(hparams.n_embd); + // TODO: extract in a common llm_graph_context::build_inp_embd_h() + auto inp = std::make_unique(hparams.n_embd); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_set_input(inp->tokens); - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens); ggml_set_input(inp->embd); - ggml_set_name(inp->embd, "mtp_h_input"); - ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; + // TODO: make static using `ggml_build_forward_select()` + // see llm_graph_context::build_inp_embd() for reference + ggml_tensor * tok_embd; + if (ubatch.token) { + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; - ggml_tensor * h_input = inp->embd; - ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + } else { + tok_embd = inp->embd; + } cb(tok_embd, "mtp_tok_embd", il); + inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->h); + ggml_set_name(inp->h, "mtp_h_input"); + + ggml_tensor * h_embd = inp->h; + res->add_input(std::move(inp)); ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = (n_outputs > 0 && n_outputs < n_tokens) ? build_inp_out_ids() : nullptr; - auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + auto * inp_attn = build_attn_inp_kv(); - ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); cb(h_norm, "mtp_hnorm", il); ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); @@ -543,7 +546,7 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); cb(concat, "mtp_concat", il); - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); cb(cur, "mtp_eh_proj", il); ggml_tensor * inpSA = cur; @@ -616,31 +619,22 @@ llama_model_qwen35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr cur = ggml_add(ctx0, cur, ffn_residual); cb(cur, "mtp_post_ffn", il); - // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. - // (In the trunk graph this is `t_h_pre_norm`; the MTP head reuses the same slot.) - cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; - - if (n_outputs == 0) { - ggml_build_forward_expand(gf, cur); - return; - } - - if (inp_out_ids && n_outputs < n_tokens) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cb(cur, "mtp_h_pre_norm_out", -1); - } - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? layer.nextn.shared_head_norm : model.output_norm; GGML_ASSERT(head_norm_w && "QWEN35 MTP: missing both nextn.shared_head_norm and output_norm"); cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + cur = ggml_get_rows(ctx0, cur, inp_out_ids); cb(cur, "mtp_shared_head_norm", -1); ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; GGML_ASSERT(head_w && "QWEN35 MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); + cur = build_lora_mm(head_w, cur, head_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index 029e1a4febfd..eb5e9a406a15 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -16,21 +16,20 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // NextN/MTP (Qwen3.5/3.6): extra decoder block appended beyond the main stack - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer_impl"); // Mark recurrent layers (linear attention layers). MTP layers are dense // attention-only and must be flagged non-recurrent. - { - const uint32_t n_main = hparams.n_layer - hparams.nextn_predict_layers; + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = (i < n_main) && ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer - hparams.nextn_predict_layers) { + switch (hparams.n_layer()) { case 40: type = LLM_TYPE_35B_A3B; break; case 48: type = LLM_TYPE_122B_A10B; break; case 60: type = LLM_TYPE_397B_A17B; break; @@ -41,9 +40,7 @@ void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { LLAMA_LOAD_LOCALS; - const uint32_t n_main = n_layer - hparams.nextn_predict_layers; - const bool mtp_only = (hparams.nextn_predict_layers > 0) && - (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); @@ -75,7 +72,7 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", il), { n_embd }, flags); layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", il), { n_embd }, flags); - if (!hparams.is_recurrent(il)) { + if (!hparams.is_recr(il)) { // Attention layers create_tensor_qkv(layer, il, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, flags); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", il), { n_embd_head_k * n_head, n_embd }, flags); @@ -144,10 +141,10 @@ void llama_model_qwen35moe::load_arch_tensors(llama_model_loader & ml) { layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", il), { n_embd }, TENSOR_NOT_REQUIRED); }; - for (int i = 0; i < (int) n_main; ++i) { + for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } - for (int i = (int) n_main; i < n_layer; ++i) { + for (int i = n_layer; i < n_layer_all; ++i) { load_block_mtp(i); } } @@ -178,11 +175,10 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p auto * inp = build_inp_mem_hybrid(); ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = (n_outputs > 0 && (!cparams.embeddings_pre_norm || n_outputs < n_tokens)) ? build_inp_out_ids() : nullptr; + ggml_tensor * inp_out_ids = build_inp_out_ids(); // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. - const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -191,7 +187,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p ggml_build_forward_expand(gf, cur); // Determine layer type and build appropriate attention mechanism - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // Linear attention layer (gated delta net) cur = build_layer_attn_linear(inp->get_recr(), cur, il); } else { @@ -199,7 +195,7 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p cur = build_layer_attn(inp->get_attn(), cur, inp_pos, sections, il); } - if (il == n_transformer_layers - 1 && inp_out_ids && !cparams.embeddings_pre_norm) { + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -231,22 +227,16 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p } cur = inpL; - cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; + // post-norm hidden state feeds both the LM head and the MTP seed below + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); - if (n_outputs == 0) { - ggml_build_forward_expand(gf, cur); - return; - } + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; - if (inp_out_ids && cparams.embeddings_pre_norm && n_outputs < n_tokens) { + if (!cparams.embeddings_nextn_masked && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cb(cur, "h_pre_norm_out", -1); } - // Final norm - cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); - cb(cur, "result_norm", -1); res->t_embd = cur; @@ -560,13 +550,13 @@ ggml_tensor * llama_model_qwen35moe::graph::build_layer_ffn(ggml_tensor * cur, c // LLM_GRAPH_TYPE_DECODER_MTP draft head for Qwen3.5/3.6 MoE llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - GGML_ASSERT(hparams.nextn_predict_layers > 0 && "QWEN35MOE MTP requires nextn_predict_layers > 0"); - GGML_ASSERT(hparams.nextn_predict_layers == 1 && "QWEN35MOE MTP currently only supports a single MTP block"); + GGML_ASSERT(hparams.n_layer_nextn > 0 && "QWEN35MOE MTP requires n_layer_nextn > 0"); + GGML_ASSERT(hparams.n_layer_nextn == 1 && "QWEN35MOE MTP currently only supports a single MTP block"); const int64_t n_embd_head = hparams.n_embd_head_v(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); - const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers; + const int il = hparams.n_layer(); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); @@ -577,28 +567,41 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); - auto inp = std::make_unique(hparams.n_embd); + // TODO: extract in a common llm_graph_context::build_inp_embd_h() + auto inp = std::make_unique(hparams.n_embd); inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_set_input(inp->tokens); - inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens); ggml_set_input(inp->embd); - ggml_set_name(inp->embd, "mtp_h_input"); - ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; + // TODO: make static using `ggml_build_forward_select()` + // see llm_graph_context::build_inp_embd() for reference + ggml_tensor * tok_embd; + if (ubatch.token) { + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; - ggml_tensor * h_input = inp->embd; - ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + } else { + tok_embd = inp->embd; + } cb(tok_embd, "mtp_tok_embd", il); + inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->h); + ggml_set_name(inp->h, "mtp_h_input"); + + ggml_tensor * h_embd = inp->h; + res->add_input(std::move(inp)); ggml_tensor * inp_pos = build_inp_pos(); - ggml_tensor * inp_out_ids = (n_outputs > 0 && n_outputs < n_tokens) ? build_inp_out_ids() : nullptr; - auto * inp_attn = build_attn_inp_kv(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + auto * inp_attn = build_attn_inp_kv(); - ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); + ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il); cb(h_norm, "mtp_hnorm", il); ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il); @@ -607,7 +610,7 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); cb(concat, "mtp_concat", il); - ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat); + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); cb(cur, "mtp_eh_proj", il); ggml_tensor * inpSA = cur; @@ -713,30 +716,22 @@ llama_model_qwen35moe::graph_mtp::graph_mtp(const llama_model & model, const llm cur = ggml_add(ctx0, cur, ffn_residual); cb(cur, "mtp_post_ffn", il); - // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. - cb(cur, "h_pre_norm", -1); - res->t_h_pre_norm = cur; - - if (n_outputs == 0) { - ggml_build_forward_expand(gf, cur); - return; - } - - if (inp_out_ids && n_outputs < n_tokens) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - cb(cur, "mtp_h_pre_norm_out", -1); - } - ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? layer.nextn.shared_head_norm : model.output_norm; GGML_ASSERT(head_norm_w && "QWEN35MOE MTP: missing both nextn.shared_head_norm and output_norm"); cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1); + + cb(cur, "h_nextn", -1); + res->t_h_nextn= cur; + + cur = ggml_get_rows(ctx0, cur, inp_out_ids); cb(cur, "mtp_shared_head_norm", -1); ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + ggml_tensor * head_s = layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : model.output_s; GGML_ASSERT(head_w && "QWEN35MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); - cur = build_lora_mm(head_w, cur); + cur = build_lora_mm(head_w, cur, head_s); cb(cur, "result_output", -1); res->t_logits = cur; diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index a4f8e1379c90..317e668bec79 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -1,10 +1,10 @@ #include "models.h" void llama_model_qwen3moe::load_arch_hparams(llama_model_loader & ml) { - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 1d873427db5e..97200a44072f 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -14,15 +14,15 @@ void llama_model_qwen3next::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); // Mark recurrent layers (linear attention layers) - { + if (!ml.get_key_or_arr(LLM_KV_ATTENTION_RECURRENT_LAYERS, hparams.is_recr_impl, hparams.n_layer_all, false)) { uint32_t full_attn_interval = 4; ml.get_key(LLM_KV_FULL_ATTENTION_INTERVAL, full_attn_interval, false); - for (uint32_t i = 0; i < hparams.n_layer; ++i) { - hparams.recurrent_layer_arr[i] = ((i + 1) % full_attn_interval != 0); + for (uint32_t i = 0; i < hparams.n_layer_all; ++i) { + hparams.is_recr_impl[i] = (i < hparams.n_layer()) && ((i + 1) % full_attn_interval != 0); } } - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_80B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -68,7 +68,7 @@ void llama_model_qwen3next::load_arch_tensors(llama_model_loader &) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0); layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0); - if (!hparams.is_recurrent(i)) { + if (!hparams.is_recr(i)) { // Attention layers create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head * 2, n_embd_k_gqa, n_embd_v_gqa, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0); @@ -129,7 +129,7 @@ llama_model_qwen3next::graph::graph(const llama_model & model, const llm_graph_p ggml_build_forward_expand(gf, cur); // Determine layer type and build appropriate attention mechanism - if (hparams.is_recurrent(il)) { + if (hparams.is_recr(il)) { // Linear attention layer (gated delta net) cur = build_layer_attn_linear(inp->get_recr(), cur, il); } else { diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 5defd8939448..724d6140d193 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -4,7 +4,8 @@ void llama_model_qwen3vl::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_NUM_DEEPSTACK_LAYERS, hparams.n_deepstack_layers, false); ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 28: type = LLM_TYPE_1_7B; break; case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; case 64: type = LLM_TYPE_32B; break; diff --git a/src/models/qwen3vlmoe.cpp b/src/models/qwen3vlmoe.cpp index 5b77df57122e..7c41592f7727 100644 --- a/src/models/qwen3vlmoe.cpp +++ b/src/models/qwen3vlmoe.cpp @@ -5,7 +5,8 @@ void llama_model_qwen3vlmoe::load_arch_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, true); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/refact.cpp b/src/models/refact.cpp index bf3949a90920..a46c358fa68a 100644 --- a/src/models/refact.cpp +++ b/src/models/refact.cpp @@ -2,7 +2,8 @@ void llama_model_refact::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_1B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp index ca8e009615e9..fc276ce591bf 100644 --- a/src/models/rnd1.cpp +++ b/src/models/rnd1.cpp @@ -2,12 +2,13 @@ void llama_model_rnd1::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 48: type = LLM_TYPE_30B_A3B; break; default: type = LLM_TYPE_UNKNOWN; } + // Set non-causal attention for diffusion models hparams.causal_attn = false; } diff --git a/src/models/rwkv6.cpp b/src/models/rwkv6.cpp index ba2a9dfa0db4..0b5013dc7585 100644 --- a/src/models/rwkv6.cpp +++ b/src/models/rwkv6.cpp @@ -9,7 +9,7 @@ void llama_model_rwkv6::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_6B; break; case 32: switch (hparams.n_embd) { diff --git a/src/models/rwkv6qwen2.cpp b/src/models/rwkv6qwen2.cpp index 566b8cdcb549..6c7db514435e 100644 --- a/src/models/rwkv6qwen2.cpp +++ b/src/models/rwkv6qwen2.cpp @@ -9,7 +9,7 @@ void llama_model_rwkv6qwen2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1_6B; break; case 32: switch (hparams.n_embd) { diff --git a/src/models/rwkv7.cpp b/src/models/rwkv7.cpp index 7574b252621c..67c51f5b59c6 100644 --- a/src/models/rwkv7.cpp +++ b/src/models/rwkv7.cpp @@ -10,7 +10,7 @@ void llama_model_rwkv7::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_GATE_LORA_RANK, hparams.n_lora_gate, false); ml.get_key(LLM_KV_TOKEN_SHIFT_COUNT, hparams.token_shift_count, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 12: switch (hparams.n_embd) { case 768: type = LLM_TYPE_190M; break; diff --git a/src/models/seed-oss.cpp b/src/models/seed-oss.cpp index 806cba574be8..57de881a091f 100644 --- a/src/models/seed-oss.cpp +++ b/src/models/seed-oss.cpp @@ -2,7 +2,8 @@ void llama_model_seed_oss::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 64: type = LLM_TYPE_36B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 4231cccc666f..a8e3d957f1f0 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -15,14 +15,14 @@ void llama_model_smallthinker::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); } else { hparams.swa_type = LLAMA_SWA_TYPE_NONE; - hparams.n_no_rope_layer_step = hparams.n_layer; + hparams.n_no_rope_layer_step = hparams.n_layer(); } ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_4B; break; case 52: type = LLM_TYPE_20B; break; default: type = LLM_TYPE_UNKNOWN; diff --git a/src/models/smollm3.cpp b/src/models/smollm3.cpp index 90e7d473eaff..c67d967b2040 100644 --- a/src/models/smollm3.cpp +++ b/src/models/smollm3.cpp @@ -4,7 +4,7 @@ void llama_model_smollm3::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); hparams.n_no_rope_layer_step = 4; - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 36: type = LLM_TYPE_3B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/stablelm.cpp b/src/models/stablelm.cpp index 4da7f7aefcf0..bf6087b87960 100644 --- a/src/models/stablelm.cpp +++ b/src/models/stablelm.cpp @@ -3,7 +3,7 @@ void llama_model_stablelm::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 32: type = LLM_TYPE_3B; break; case 40: type = LLM_TYPE_12B; break; diff --git a/src/models/starcoder.cpp b/src/models/starcoder.cpp index e131af058bc3..f73a88fd4e90 100644 --- a/src/models/starcoder.cpp +++ b/src/models/starcoder.cpp @@ -2,7 +2,8 @@ void llama_model_starcoder::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 24: type = LLM_TYPE_1B; break; case 36: type = LLM_TYPE_3B; break; case 42: type = LLM_TYPE_7B; break; diff --git a/src/models/starcoder2.cpp b/src/models/starcoder2.cpp index 9c207c028856..b81b469374a4 100644 --- a/src/models/starcoder2.cpp +++ b/src/models/starcoder2.cpp @@ -2,7 +2,8 @@ void llama_model_starcoder2::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 30: type = LLM_TYPE_3B; break; case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_15B; break; diff --git a/src/models/step35.cpp b/src/models/step35.cpp index 3b68e68707ae..7f52fa61d1fd 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -22,11 +22,11 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); - ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer(), false); + ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer(), false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 45: type = LLM_TYPE_196B_A11B; break; default: type = LLM_TYPE_UNKNOWN; } diff --git a/src/models/t5.cpp b/src/models/t5.cpp index 73e327414066..b0e3f062572f 100644 --- a/src/models/t5.cpp +++ b/src/models/t5.cpp @@ -9,10 +9,10 @@ void llama_model_t5::load_arch_hparams(llama_model_loader & ml) { hparams.dec_start_token_id = dec_start_token_id; } - hparams.dec_n_layer = hparams.n_layer; + hparams.dec_n_layer = hparams.n_layer(); ml.get_key(LLM_KV_DECODER_BLOCK_COUNT, hparams.dec_n_layer, false); - switch (hparams.n_layer) { + switch (hparams.n_layer()) { case 6: type = LLM_TYPE_60M; break; // t5-small case 8: type = LLM_TYPE_80M; break; // flan-t5-small case 12: diff --git a/src/models/xverse.cpp b/src/models/xverse.cpp index d6d1c7a2e5de..3135001293a8 100644 --- a/src/models/xverse.cpp +++ b/src/models/xverse.cpp @@ -2,7 +2,8 @@ void llama_model_xverse::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { + + switch (hparams.n_layer()) { case 32: type = LLM_TYPE_7B; break; case 40: type = LLM_TYPE_13B; break; case 80: type = LLM_TYPE_65B; break; diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index 16af11a28623..d1138da0f6f5 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -387,7 +387,7 @@ static bool arch_supported(const llm_arch arch) { if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { return false; // FIXME CUDA backend crashes. } - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { return false; // FIXME @ngxson } if (arch == LLM_ARCH_LLAMA_EMBED || arch == LLM_ARCH_GEMMA_EMBEDDING || arch == LLM_ARCH_T5ENCODER) { @@ -442,7 +442,7 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; } - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } for (bool moe : {false, true}) { @@ -526,7 +526,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg if (target_arch != LLM_ARCH_UNKNOWN && arch != target_arch) { continue; } - if (arch == LLM_ARCH_GEMMA4) { + if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } diff --git a/tools/cli/README.md b/tools/cli/README.md index c40b5a21cc0f..38bc78a3fdf8 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -191,10 +191,10 @@ | `--spec-draft-override-tensor, -otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) | | `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) | -| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | +| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) | | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) | -| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | diff --git a/tools/server/README.md b/tools/server/README.md index 1ecfb9cdd816..e0a25398c032 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -183,6 +183,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)
(env: LLAMA_ARG_ALIAS) | | `--tags STRING` | set model tags, comma-separated (informational, not used for routing)
(env: LLAMA_ARG_TAGS) | +| `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) | | `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)
(env: LLAMA_ARG_HOST) | | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)
(env: LLAMA_ARG_REUSE_PORT) | @@ -246,10 +247,10 @@ For the full list of features, please refer to [server's changelog](https://gith | `--spec-draft-override-tensor, -otd, --override-tensor-draft =,...` | override tensor buffer type for draft model | | `--spec-draft-cpu-moe, -cmoed, --cpu-moe-draft` | keep all Mixture of Experts (MoE) weights in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_CPU_MOE) | | `--spec-draft-n-cpu-moe, --spec-draft-ncmoe, -ncmoed, --n-cpu-moe-draft N` | keep the Mixture of Experts (MoE) weights of the first N layers in the CPU for the draft model
(env: LLAMA_ARG_SPEC_DRAFT_N_CPU_MOE) | -| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | +| `--spec-draft-n-max N` | number of tokens to draft for speculative decoding (default: 3)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX) | | `--spec-draft-n-min N` | minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN) | | `--spec-draft-p-split, --draft-p-split P` | speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT) | -| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | +| `--spec-draft-p-min, --draft-p-min P` | minimum speculative decoding probability (greedy) (default: 0.00)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN) | | `--spec-draft-device, -devd, --device-draft ` | comma-separated list of devices to use for offloading the draft model (none = don't offload)
use --list-devices to see a list of available devices | | `--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N` | max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT) | | `--spec-draft-model, -md, --model-draft FNAME` | draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL) | diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index abae3d1f2655..cbb79547c012 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1,4 +1,3 @@ - #include "server-context.h" #include "server-chat.h" #include "server-common.h" @@ -15,6 +14,11 @@ #include "mtmd.h" #include "mtmd-helper.h" +#include "ggml-cpp.h" + +// TODO: tmp until the mtmd draft processing is refactored [TAG_MTMD_DRAFT_PROCESSING] +#include "../../src/llama-ext.h" + #include #include #include @@ -37,6 +41,21 @@ using json = nlohmann::ordered_json; constexpr int HTTP_POLLING_SECONDS = 1; +static uint32_t server_n_outputs_max(const common_params & params) { + const uint32_t n_batch = params.n_batch; + + if (params.embedding || + (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED && params.pooling_type != LLAMA_POOLING_TYPE_NONE)) { + return n_batch; + } + + const uint32_t n_outputs_per_seq = 1 + common_speculative_n_max(¶ms.speculative); + + const uint64_t n_outputs = (uint64_t) params.n_parallel * n_outputs_per_seq; + + return std::max(1, std::min(n_batch, n_outputs)); +} + // state diagram: https://github.com/ggml-org/llama.cpp/pull/9283 enum slot_state { SLOT_STATE_IDLE, @@ -241,11 +260,12 @@ struct server_slot { bool need_embd() const { GGML_ASSERT(task); - return task->need_embd(); + return task->need_embd() || (spec && common_speculative_need_embd(spec)); } - bool need_embd_pre_norm() const { - return spec && common_speculative_need_embd(spec); + bool need_embd_nextn() const { + GGML_ASSERT(task); + return spec && common_speculative_need_embd_nextn(spec); } // if the context does not have a memory module then all embeddings have to be computed within a single ubatch @@ -742,6 +762,11 @@ struct server_context_impl { SRV_INF("loading model '%s'\n", params.model.path.c_str()); params_base = params; + params_base.n_outputs_max = server_n_outputs_max(params_base); + + // note: upstream's --fit VRAM pre-estimation for the mmproj / draft-MTP + // context (mtmd_get_memory_usage + common_get_device_memory_data) is not + // carried on this fork yet; MTP itself does not depend on it. llama_init = common_init_from_params(params_base); @@ -793,16 +818,17 @@ struct server_context_impl { const bool spec_mtp = std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); + if (spec_mtp) { cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP; } // note: for small models maybe we can set this to the maximum possible draft from all speculative types // the extra memory for small models is likely negligible? - cparams.n_rs_seq = 0; - ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); + cparams.n_rs_seq = 0; + cparams.ctx_other = ctx_tgt; - ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); + ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); @@ -812,8 +838,12 @@ struct server_context_impl { params_base.model.path.c_str()); auto cparams_mtp = common_context_params_to_llama(params_base); - cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP; - cparams_mtp.n_rs_seq = 0; + cparams_mtp.ctx_type = LLAMA_CONTEXT_TYPE_MTP; + cparams_mtp.type_k = params_base.speculative.draft.cache_type_k; + cparams_mtp.type_v = params_base.speculative.draft.cache_type_v; + cparams_mtp.n_rs_seq = 0; + cparams_mtp.n_outputs_max = params_base.n_parallel; + cparams_mtp.ctx_other = ctx_tgt; ctx_dft.reset(llama_init_from_model(model_tgt, cparams_mtp)); if (ctx_dft == nullptr) { @@ -821,8 +851,6 @@ struct server_context_impl { return false; } - ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); - params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); } @@ -920,6 +948,10 @@ struct server_context_impl { } } + if (ctx_dft) { + ctx_dft_seq_rm_type = common_context_can_seq_rm(ctx_dft.get()); + } + if (spec) { SRV_INF("%s", "speculative decoding context initialized\n"); } else { @@ -2856,10 +2888,11 @@ struct server_context_impl { continue; } - if (ctx_dft) { + if (ctx_dft && llama_get_ctx_other(ctx_dft.get()) != ctx_tgt) { // TODO: in the future, figure out how to infuse target embeddings to the images // for now, we skip this for simplicity // maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above? + // [TAG_MTMD_DRAFT_PROCESSING] res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); if (res != 0) { GGML_ABORT("failed to process multi-modal data on draft context\n"); @@ -2893,6 +2926,9 @@ struct server_context_impl { break; } + // embedding requires all tokens in the batch to be output; + // MTP also wants logits at every prompt position so the + // streaming hook can mirror t_h_nextn into ctx_dft. common_batch_add(batch, cur_tok, slot.prompt.tokens.pos_next(),