From 1f1bb6423fe6a9b437d2265cda413f611661a389 Mon Sep 17 00:00:00 2001 From: forforever73 <690105611@qq.com> Date: Wed, 27 May 2026 11:24:38 +0800 Subject: [PATCH 1/6] feat: support step3.7 --- common/jinja/value.cpp | 17 +++++++++++++++++ conversion/__init__.py | 2 ++ conversion/base.py | 2 +- conversion/step3.py | 43 ++++++++++++++++++++++++++++++++++++++---- tests/test-jinja.cpp | 7 +++++++ 5 files changed, 66 insertions(+), 5 deletions(-) diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp index 0b79098cd1e7..ea03e86e72ec 100644 --- a/common/jinja/value.cpp +++ b/common/jinja/value.cpp @@ -594,6 +594,8 @@ static bool string_endswith(const std::string & str, const std::string & suffix) throw not_implemented_exception("String join builtin not implemented"); } +static value fromjson(const func_args & args); + const func_builtins & value_string_t::get_builtins() const { static const func_builtins builtins = { {"default", default_value}, @@ -813,6 +815,7 @@ const func_builtins & value_string_t::get_builtins() const { args.ensure_vals(); return args.get_pos(0); }}, + {"fromjson", fromjson}, {"tojson", tojson}, {"indent", [](const func_args &args) -> value { args.ensure_count(1, 4); @@ -1285,6 +1288,20 @@ static value from_json(const nlohmann::ordered_json & j, bool mark_input) { } } +static value fromjson(const func_args & args) { + args.ensure_count(1); + args.ensure_vals(); + + const auto & input = args.get_pos(0)->as_string(); + + try { + const auto parsed = nlohmann::ordered_json::parse(input.str()); + return from_json(parsed, input.all_parts_are_input()); + } catch (const nlohmann::json::exception & e) { + throw raised_exception("fromjson: failed to parse JSON: " + std::string(e.what())); + } +} + // compare operator for value_t bool value_compare(const value & a, const value & b, value_compare_op op) { auto cmp = [&]() { diff --git a/conversion/__init__.py b/conversion/__init__.py index 2a87bd75b441..c4a170acf2f8 100644 --- a/conversion/__init__.py +++ 
b/conversion/__init__.py @@ -213,6 +213,7 @@ "Starcoder2ForCausalLM": "starcoder", "Step3p5ForCausalLM": "step3", "StepVLForConditionalGeneration": "step3", + "Step3p7ForConditionalGeneration": "step3", "T5EncoderModel": "t5", "T5ForConditionalGeneration": "t5", "T5WithLMHeadModel": "t5", @@ -279,6 +280,7 @@ "Sarashina2VisionForCausalLM": "sarashina2", "SmolVLMForConditionalGeneration": "smolvlm", "StepVLForConditionalGeneration": "step3", + "Step3p7ForConditionalGeneration": "step3", "UltravoxModel": "ultravox", "VoxtralForConditionalGeneration": "ultravox", "YoutuVLForConditionalGeneration": "youtuvl", diff --git a/conversion/base.py b/conversion/base.py index 1d3554ea2972..55be84411959 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -2552,7 +2552,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st # Step3-VL keeps text config under text_config but uses a custom top-level architecture. # For text conversion we route to a dedicated text-only class. # TODO: refactor this later to avoid adding exception here - if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"): + if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM", "Step3p7ForConditionalGeneration"): return arch # if "architectures" is found in the sub-config, use that instead diff --git a/conversion/step3.py b/conversion/step3.py index ba867fb831ba..f7b0abca97c6 100644 --- a/conversion/step3.py +++ b/conversion/step3.py @@ -95,6 +95,13 @@ class Step3VLTextModel(Qwen3Model): model_arch = gguf.MODEL_ARCH.QWEN3 +# Step3.7 reuses the Step3-VL vision tower/projector; a separate subclass is only +# needed because ModelBase.register maps each arch to a class. 
+@ModelBase.register("Step3p7ForConditionalGeneration") +class Step37VisionModel(Step3VLVisionModel): + pass + + @ModelBase.register("Step3p5ForCausalLM") class Step35Model(TextModel): model_arch = gguf.MODEL_ARCH.STEP35 @@ -203,11 +210,23 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if isinstance(rope_theta, list): rope_theta = rope_theta[0] base = float(rope_theta) - if (dim := self.hparams.get("head_dim")) is None: - dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - dim = int(dim) - freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + if (storage_dim := self.hparams.get("head_dim")) is None: + storage_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + storage_dim = int(storage_dim) + + # Llama 3 factors apply only to the rotary dims used by full_attention layers + # (partial_rotary_factor * head_dim). Remaining slots are padded with 1.0 so + # sliding_attention layers remain unaffected. set_gguf_parameters already + # guarantees at least one full_attention layer. + layer_types = (self.hparams.get("layer_types") or [])[: self.block_count] + partial_rotary_factors = (self.hparams.get("partial_rotary_factors") or [])[: self.block_count] + full_attention_factor = next( + float(f) for lt, f in zip(layer_types, partial_rotary_factors) if lt == "full_attention" + ) + rotary_dim = int(storage_dim * full_attention_factor) + + freqs = 1.0 / (base ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim)) factor = float(rope_params.get("factor", 8.0)) low_freq_factor = float(rope_params.get("low_freq_factor", 1.0)) @@ -228,4 +247,20 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth)) + # Pad to head_dim/2 with 1.0 so non-scaled layers remain neutral. 
+ if len(rope_factors) < storage_dim // 2: + rope_factors.extend([1.0] * (storage_dim // 2 - len(rope_factors))) + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + +@ModelBase.register("Step3p7ForConditionalGeneration") +class Step37TextModel(Step35Model): + model_arch = gguf.MODEL_ARCH.STEP35 + + @classmethod + def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: + name, _ = item + if name.startswith(("vision_model.", "model.vision_model.", "vit_large_projector.")): + return None + return super().filter_tensors(item) diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp index b5ee53461e8d..afc64c4c9698 100644 --- a/tests/test-jinja.cpp +++ b/tests/test-jinja.cpp @@ -523,6 +523,12 @@ static void test_set_statement(testing & t) { } static void test_filters(testing & t) { + test_template(t, "fromjson parses object string", + "{% set arguments = '{\"arg\": \"hello\"}' | fromjson %}{{ arguments.arg }}", + json::object(), + "hello" + ); + test_template(t, "upper", "{{ 'hello'|upper }}", json::object(), @@ -2004,6 +2010,7 @@ def raise_exception(message): raise jinja2.exceptions.TemplateError(message) env.filters["tojson"] = lambda x, ensure_ascii=False, indent=None, separators=None, sort_keys=False: json.dumps(x, ensure_ascii=ensure_ascii, indent=indent, separators=separators, sort_keys=sort_keys) +env.filters["fromjson"] = lambda x: json.loads(x) env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) env.globals["raise_exception"] = raise_exception From 38eee9e0cb714a0e44ced0cca7c16c597ddba9d6 Mon Sep 17 00:00:00 2001 From: forforever73 <690105611@qq.com> Date: Fri, 29 May 2026 13:08:45 +0800 Subject: [PATCH 2/6] fix: register Step-3.7 BPE pre-tokenizer hash --- conversion/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conversion/base.py b/conversion/base.py index 55be84411959..54e0385af6aa 100644 --- 
a/conversion/base.py +++ b/conversion/base.py @@ -1541,6 +1541,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 res = "deepseek-v3" + if chkhsh == "5841594bd6a8eeecd7207aeec6570831cc97ffaeba51e908bdaf560113177bae": + # ref: https://huggingface.co/stepfun-ai/Step-3.7-Flash (Mistral-style pre-tokenizer; split regex identical to deepseek-v3) + res = "deepseek-v3" if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B res = "deepseek-r1-qwen" From ae0dcaca9f34babc6ab09c04d541844b1ddb45fb Mon Sep 17 00:00:00 2001 From: forforever73 <690105611@qq.com> Date: Fri, 29 May 2026 17:31:41 +0800 Subject: [PATCH 3/6] delete fromjson --- common/jinja/value.cpp | 16 ---------------- tests/test-jinja.cpp | 7 ------- 2 files changed, 23 deletions(-) diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp index ea03e86e72ec..5c72283eb584 100644 --- a/common/jinja/value.cpp +++ b/common/jinja/value.cpp @@ -594,7 +594,6 @@ static bool string_endswith(const std::string & str, const std::string & suffix) throw not_implemented_exception("String join builtin not implemented"); } -static value fromjson(const func_args & args); const func_builtins & value_string_t::get_builtins() const { static const func_builtins builtins = { @@ -815,7 +814,6 @@ const func_builtins & value_string_t::get_builtins() const { args.ensure_vals(); return args.get_pos(0); }}, - {"fromjson", fromjson}, {"tojson", tojson}, {"indent", [](const func_args &args) -> value { args.ensure_count(1, 4); @@ -1288,20 +1286,6 @@ static value from_json(const nlohmann::ordered_json & j, bool mark_input) { } } -static value fromjson(const func_args & args) { - args.ensure_count(1); - args.ensure_vals(); - - const auto & input = args.get_pos(0)->as_string(); - - try { - const auto parsed = 
nlohmann::ordered_json::parse(input.str()); - return from_json(parsed, input.all_parts_are_input()); - } catch (const nlohmann::json::exception & e) { - throw raised_exception("fromjson: failed to parse JSON: " + std::string(e.what())); - } -} - // compare operator for value_t bool value_compare(const value & a, const value & b, value_compare_op op) { auto cmp = [&]() { diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp index afc64c4c9698..b5ee53461e8d 100644 --- a/tests/test-jinja.cpp +++ b/tests/test-jinja.cpp @@ -523,12 +523,6 @@ static void test_set_statement(testing & t) { } static void test_filters(testing & t) { - test_template(t, "fromjson parses object string", - "{% set arguments = '{\"arg\": \"hello\"}' | fromjson %}{{ arguments.arg }}", - json::object(), - "hello" - ); - test_template(t, "upper", "{{ 'hello'|upper }}", json::object(), @@ -2010,7 +2004,6 @@ def raise_exception(message): raise jinja2.exceptions.TemplateError(message) env.filters["tojson"] = lambda x, ensure_ascii=False, indent=None, separators=None, sort_keys=False: json.dumps(x, ensure_ascii=ensure_ascii, indent=indent, separators=separators, sort_keys=sort_keys) -env.filters["fromjson"] = lambda x: json.loads(x) env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) env.globals["raise_exception"] = raise_exception From eb1bc0ed3001551f621a8de801ae62c679573a98 Mon Sep 17 00:00:00 2001 From: forforever73 <690105611@qq.com> Date: Fri, 29 May 2026 23:42:58 +0800 Subject: [PATCH 4/6] register step3.7 arch to Step35Model --- common/jinja/value.cpp | 1 - conversion/base.py | 3 --- conversion/step3.py | 26 +++++--------------------- 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp index 5c72283eb584..0b79098cd1e7 100644 --- a/common/jinja/value.cpp +++ b/common/jinja/value.cpp @@ -594,7 +594,6 @@ static bool string_endswith(const std::string & str, const std::string & suffix) throw 
not_implemented_exception("String join builtin not implemented"); } - const func_builtins & value_string_t::get_builtins() const { static const func_builtins builtins = { {"default", default_value}, diff --git a/conversion/base.py b/conversion/base.py index 54e0385af6aa..55be84411959 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -1541,9 +1541,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3 res = "deepseek-v3" - if chkhsh == "5841594bd6a8eeecd7207aeec6570831cc97ffaeba51e908bdaf560113177bae": - # ref: https://huggingface.co/stepfun-ai/Step-3.7-Flash (Mistral-style pre-tokenizer; split regex identical to deepseek-v3) - res = "deepseek-v3" if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5": # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B res = "deepseek-r1-qwen" diff --git a/conversion/step3.py b/conversion/step3.py index f7b0abca97c6..0b5a8ca3c6ee 100644 --- a/conversion/step3.py +++ b/conversion/step3.py @@ -15,7 +15,7 @@ from .qwen import Qwen3Model -@ModelBase.register("StepVLForConditionalGeneration") +@ModelBase.register("StepVLForConditionalGeneration", "Step3p7ForConditionalGeneration") class Step3VLVisionModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -95,14 +95,7 @@ class Step3VLTextModel(Qwen3Model): model_arch = gguf.MODEL_ARCH.QWEN3 -# Step3.7 reuses the Step3-VL vision tower/projector; a separate subclass is only -# needed because ModelBase.register maps each arch to a class. 
-@ModelBase.register("Step3p7ForConditionalGeneration") -class Step37VisionModel(Step3VLVisionModel): - pass - - -@ModelBase.register("Step3p5ForCausalLM") +@ModelBase.register("Step3p5ForCausalLM", "Step3p7ForConditionalGeneration") class Step35Model(TextModel): model_arch = gguf.MODEL_ARCH.STEP35 @@ -176,6 +169,9 @@ def set_gguf_parameters(self): def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: name, gen = item + if name.startswith("vit_large_projector."): + return None + # Map router bias (expert selection bias) to a GGUF bias tensor if name.endswith(".moe.router_bias"): name += ".bias" @@ -252,15 +248,3 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_factors.extend([1.0] * (storage_dim // 2 - len(rope_factors))) yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) - - -@ModelBase.register("Step3p7ForConditionalGeneration") -class Step37TextModel(Step35Model): - model_arch = gguf.MODEL_ARCH.STEP35 - - @classmethod - def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: - name, _ = item - if name.startswith(("vision_model.", "model.vision_model.", "vit_large_projector.")): - return None - return super().filter_tensors(item) From 84c5857e6163778cebb47a502a881e32e89dceee Mon Sep 17 00:00:00 2001 From: forforever73 <690105611@qq.com> Date: Sat, 30 May 2026 02:23:29 +0800 Subject: [PATCH 5/6] drop vit projector in base filter --- conversion/base.py | 2 +- conversion/step3.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/conversion/base.py b/conversion/base.py index 55be84411959..10969a3156ab 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -1112,7 +1112,7 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca name, gen = item # Skip multimodal tensors - if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", 
"conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \ + if name.startswith(("mlp", "vit.", "vit_large_projector.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \ or "visual." in name or "vision." in name or "audio." in name or "talker." in name \ or "vision_" in name or "audio_" in name or "sam_model" in name \ or "token2wav." in name or "code2wav." in name \ diff --git a/conversion/step3.py b/conversion/step3.py index 0b5a8ca3c6ee..673fa14af3e3 100644 --- a/conversion/step3.py +++ b/conversion/step3.py @@ -168,10 +168,6 @@ def set_gguf_parameters(self): @classmethod def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: name, gen = item - - if name.startswith("vit_large_projector."): - return None - # Map router bias (expert selection bias) to a GGUF bias tensor if name.endswith(".moe.router_bias"): name += ".bias" From 7e6776a1d44daf0cdea75fd6209b205f9ca21dc0 Mon Sep 17 00:00:00 2001 From: eauchs <138336683+eauchs@users.noreply.github.com> Date: Sun, 31 May 2026 21:33:22 +0200 Subject: [PATCH 6/6] feat(step3.7): support NextN/MTP heads for speculative decoding Step-3.7-Flash ships num_nextn_predict_layers (3) dense MTP blocks after the main transformer (HF model.layers.{N..N+K-1}). The current converter silently drops them and the runtime arch graph never declares an MTP draft head, so `--spec-type draft-mtp` is unavailable for step35 GGUFs. This change wires the full chain end-to-end: conversion/step3.py - Extend block_count by num_nextn_predict_layers. - Stop filtering HF layers >= num_hidden_layers when MTP is enabled. - Emit `step35.nextn_predict_layers` GGUF metadata. - Pad per-layer arrays (layer_types, partial_rotary_factors, swiglu_limits[_shared]) for the MTP blocks (full-attention, no clamp). gguf-py/gguf/constants.py - Register the NEXTN_* tensors on MODEL_ARCH.STEP35. 
gguf-py/gguf/tensor_mapping.py - Map Step-3.7's `transformer.shared_head.{norm,output}` to NEXTN_SHARED_HEAD_{NORM,HEAD}. src/models/step35.cpp + src/models/models.h - Read `nextn_predict_layers` in load_arch_hparams; force the trailing blocks to full-attention. - Split tensor loading: trunk (MoE + shared expert + Step35 attn) for [0, n_main) and MTP heads (dense SwiGLU MLP + nextn.* + per-block shared head) for [n_main, n_layer). - Trim the main forward to n_transformer_layers and expose res->t_h_pre_norm so the draft head can seed AR steps. - Implement llama_model_step35::graph_mtp following the Qwen3.5 single-block convention but with Step35 attention semantics (head-wise sigmoid gate, q/k norm, partial rotary) and a dense MLP (Step-3.7 MTP heads use mlp.{gate,up,down}_proj, not MoE). --- conversion/step3.py | 56 +++++-- gguf-py/gguf/constants.py | 6 + gguf-py/gguf/tensor_mapping.py | 2 + src/models/models.h | 7 + src/models/step35.cpp | 292 ++++++++++++++++++++++++++++++++- 5 files changed, 349 insertions(+), 14 deletions(-) diff --git a/conversion/step3.py b/conversion/step3.py index 673fa14af3e3..b6c8c0d1060a 100644 --- a/conversion/step3.py +++ b/conversion/step3.py @@ -99,6 +99,19 @@ class Step3VLTextModel(Qwen3Model): class Step35Model(TextModel): model_arch = gguf.MODEL_ARCH.STEP35 + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Step-3.7 ships NextN/MTP heads (num_nextn_predict_layers > 0) after the + # main transformer stack. We expose them as extra blocks (blk.N..blk.N+K-1) + # so the model loader can find their tensors under blk.%d.nextn.* and the + # final dense MLP / shared head tensors. 
+ nextn = int(self.hparams.get("num_nextn_predict_layers", 0)) + self._nextn_predict_layers = nextn + self._n_main_layers = int(self.hparams["num_hidden_layers"]) + if nextn > 0: + self.block_count = self._n_main_layers + nextn + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + def set_gguf_parameters(self): rope_theta = self.hparams.get("rope_theta") if isinstance(rope_theta, list): @@ -109,8 +122,11 @@ def set_gguf_parameters(self): super().set_gguf_parameters() - layer_types = self.hparams.get("layer_types") or [] - partial_rotary_factors = self.hparams.get("partial_rotary_factors") or [] + nextn = self._nextn_predict_layers + n_main = self._n_main_layers + + layer_types = list(self.hparams.get("layer_types") or []) + partial_rotary_factors = list(self.hparams.get("partial_rotary_factors") or []) attn_other = self.hparams.get("attention_other_setting") or {} n_head_base = self.hparams["num_attention_heads"] @@ -119,9 +135,19 @@ def set_gguf_parameters(self): n_head_swa = attn_other.get("num_attention_heads", n_head_base) n_kv_swa = attn_other.get("num_attention_groups", n_kv_base) - layer_types = layer_types[: self.block_count] - partial_rotary_factors = partial_rotary_factors[: self.block_count] + # Trim the HF lists to the main transformer length first; the upstream + # config sometimes includes entries for the MTP heads, sometimes not. + layer_types = layer_types[:n_main] + partial_rotary_factors = partial_rotary_factors[:n_main] assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors + + # MTP heads are full-attention only and use the full-attention rope branch + # (half rope dims, base rope_theta). Extend per-layer arrays accordingly so + # the GGUF carries one entry per block. 
+ if nextn > 0: + layer_types += ["full_attention"] * nextn + partial_rotary_factors += [0.5] * nextn + head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types] kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types] swa_pat = [lt == "sliding_attention" for lt in layer_types] @@ -157,12 +183,21 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) - # Optional per-layer SwiGLU clamps. + # NextN/MTP heads — Step-3.7 ships num_nextn_predict_layers dense MTP + # blocks after the main transformer (model.layers.N..N+K-1 in HF). + if self._nextn_predict_layers > 0: + self.gguf_writer.add_nextn_predict_layers(self._nextn_predict_layers) + + # Optional per-layer SwiGLU clamps. Pad with 0.0 for the MTP blocks + # (MTP heads use a dense MLP without clamping), so the array length + # matches block_count. if (limits := self.hparams.get("swiglu_limits")) is not None: - limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]] + limits_f = [0.0 if v is None else float(v) for v in limits[: self._n_main_layers]] + limits_f += [0.0] * self._nextn_predict_layers self.gguf_writer.add_swiglu_clamp_exp(limits_f) if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None: - limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]] + limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self._n_main_layers]] + limits_shared_f += [0.0] * self._nextn_predict_layers self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f) @classmethod @@ -175,11 +210,12 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca return super().filter_tensors((name, gen)) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): - # remove mtp layers + # Step-3.7 MTP heads live at model.layers.{N..N+K-1}.{eh_proj,enorm,hnorm,...} + # We keep them when 
nextn_predict_layers > 0 (mapped via NEXTN_* tensors) + # and drop them otherwise to preserve backward compatibility with text-only conversion. if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None: il = int(m.group(1)) - n_main = int(self.hparams.get("num_hidden_layers", self.block_count)) - if il >= n_main: + if il >= self._n_main_layers and self._nextn_predict_layers == 0: return if name.endswith("norm.weight"): data_torch += 1.0 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0189f6f03c51..37f688ae72d6 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3937,6 +3937,12 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_EXP_PROBS_B, + # NextN/MTP heads (Step-3.7 num_nextn_predict_layers > 0) + MODEL_TENSOR.NEXTN_EH_PROJ, + MODEL_TENSOR.NEXTN_ENORM, + MODEL_TENSOR.NEXTN_HNORM, + MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, + MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], MODEL_ARCH.LLAMA_EMBED: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ecc3c05f99ac..7d5502c4e852 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -2273,10 +2273,12 @@ class TensorNameMap: MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: ( "model.layers.{bid}.shared_head.head", + "model.layers.{bid}.transformer.shared_head.output", # step3.7 ), MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: ( "model.layers.{bid}.shared_head.norm", + "model.layers.{bid}.transformer.shared_head.norm", # step3.7 ), } diff --git a/src/models/models.h b/src/models/models.h index db228865d5d0..fe85f742f542 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -1900,5 +1900,12 @@ struct llama_model_step35 : public llama_model_base { graph(const llama_model & model, const llm_graph_params & params); }; + // NextN/MTP draft head used by --spec-type draft-mtp. 
+ // Steps the AR draft loop one position ahead using the pre-norm hidden + // state from the trunk and the embedding of the previous draft token. + struct graph_mtp : public llm_graph_context { + graph_mtp(const llama_model & model, const llm_graph_params & params); + }; + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; diff --git a/src/models/step35.cpp b/src/models/step35.cpp index 3b68e68707ae..849a5adc9db5 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -26,7 +26,20 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP, hparams.swiglu_clamp_exp, hparams.n_layer, false); ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false); - switch (hparams.n_layer) { + // NextN/MTP heads — Step-3.7 trails the main transformer with + // num_nextn_predict_layers dense MTP blocks (model.layers.N..N+K-1 in HF). + // The converter appends them to block_count so n_layer reflects the total. + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer"); + + // The MTP blocks are dense + full-attention. The converter already marks them + // as non-SWA in swa_layers; defensively force full-attention here as well. 
+ for (uint32_t i = hparams.n_layer - hparams.nextn_predict_layers; i < hparams.n_layer; ++i) { + hparams.swa_layers[i] = false; + } + + const uint32_t n_main_layer = hparams.n_layer - hparams.nextn_predict_layers; + switch (n_main_layer) { case 45: type = LLM_TYPE_196B_A11B; break; default: type = LLM_TYPE_UNKNOWN; } @@ -35,6 +48,8 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) { void llama_model_step35::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; + const int n_main = n_layer - (int) hparams.nextn_predict_layers; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); // output @@ -51,7 +66,7 @@ void llama_model_step35::load_arch_tensors(llama_model_loader &) { n_rot_max = n_rot; } - for (int i = 0; i < n_layer; ++i) { + auto load_block_trunk = [&](int i) { auto & layer = layers[i]; const uint32_t n_head_l = hparams.n_head(i); @@ -95,10 +110,74 @@ void llama_model_step35::load_arch_tensors(llama_model_loader &) { layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED); layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED); + }; + + // Step-3.7 MTP block layout (per HF safetensors index): + // model.layers.{N..N+K-1}.{eh_proj, enorm, hnorm, + // input_layernorm, post_attention_layernorm, + // self_attn.{q,k,v,o,g}_proj, self_attn.{q,k}_norm, + // mlp.{gate,up,down}_proj, + // transformer.shared_head.{norm,output}} + // Each MTP head is a single transformer block with full attention and a + // DENSE SwiGLU MLP (not MoE). It owns its own LM head (shared head). 
+ auto load_block_mtp = [&](int i) { + auto & layer = layers[i]; + + const uint32_t n_head_l = hparams.n_head(i); + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); + + // Pre-attention norm + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // Standard Step35 attention block (q/k norm, head-wise gate, partial RoPE) + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); + + // rope factors (shared, see trunk) + if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { + layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); + layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); + } else { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); + } + + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0); + layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED); + + // Dense SwiGLU MLP (mlp.gate_proj, mlp.up_proj, mlp.down_proj in HF) + // Sized via the standard ffn_dim (intermediate_size). `post_attention_layernorm` + // in the HF MTP block functions as the pre-FFN norm and therefore maps to + // FFN_NORM via the tensor name map. 
+ layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // NextN-specific tensors + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), {2 * n_embd, n_embd}, 0); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), {n_embd}, 0); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), {n_embd}, 0); + // Step-3.7 has per-MTP-block shared head (transformer.shared_head.{norm,output}). + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // Step-3.7 does not ship a per-block embed_tokens — main tok_embd is reused. + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + }; + + for (int i = 0; i < n_main; ++i) { + load_block_trunk(i); + } + for (int i = n_main; i < n_layer; ++i) { + load_block_mtp(i); } } std::unique_ptr llama_model_step35::build_arch_graph(const llm_graph_params & params) const { + if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) { + return std::make_unique(*this, params); + } return std::make_unique(*this, params); } @@ -111,7 +190,10 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para auto * inp_attn = build_attn_inp_kv_iswa(); ggml_tensor * inp_out_ids = build_inp_out_ids(); - for (int il = 0; il < n_layer; ++il) { + // Iterate only the main transformer stack; the trailing nextn_predict_layers + // blocks are MTP heads invoked via LLM_GRAPH_TYPE_DECODER_MTP. 
+    const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers;
+    for (int il = 0; il < n_transformer_layers; ++il) {
         ggml_tensor * inpSA = inpL;
 
         const uint32_t n_head_l = hparams.n_head(il);
@@ -198,7 +280,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para
             cb(cur, "attn_proj", il);
         }
 
-        if (il == n_layer - 1 && inp_out_ids) {
+        if (il == n_transformer_layers - 1 && inp_out_ids) {
             cur = ggml_get_rows(ctx0, cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
@@ -257,6 +339,11 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para
 
     cur = inpL;
 
+    // Expose the pre-norm hidden state — the MTP draft head consumes this as
+    // its `h_input` (the AR draft loop seeds successive MTP steps with it).
+    cb(cur, "h_pre_norm", -1);
+    res->t_h_pre_norm = cur;
+
     cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
@@ -267,3 +354,200 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para
 
     ggml_build_forward_expand(gf, cur);
 }
+
+// =============================================================================
+// MTP draft head graph (LLM_GRAPH_TYPE_DECODER_MTP)
+// =============================================================================
+//
+// Step-3.7-Flash ships num_nextn_predict_layers (typically 3) dense MTP blocks
+// trailing the main transformer. Each block is a single full-attention layer
+// with a dense SwiGLU MLP and its own LM head. The block predicts the token
+// one position ahead given (h_prev, prev_token) where h_prev is the pre-norm
+// hidden state from the previous step (trunk for the first MTP step, then the
+// previous MTP block's pre-norm output for subsequent chained MTP calls).
+//
+// To stay aligned with the existing speculative driver this graph follows the
+// Qwen3.5 MTP layout (single-block draft per invocation). For Step-3.7 we use
+// the FIRST MTP block (lowest index). Multi-step draft chains can be issued by
+// calling this graph repeatedly with refreshed (h, token) pairs.
+//
+// Graph layout per the reference HF tensors:
+//   h_norm   = RMSNorm_h(h_input)                     // hnorm
+//   e_norm   = RMSNorm_e(embed(prev_token))           // enorm
+//   x        = eh_proj(concat(e_norm, h_norm, dim=0))
+//   attn_in  = input_layernorm(x)
+//   attn_out = step35_self_attn(attn_in)
+//   x        = x + attn_out
+//   ffn_in   = post_attention_layernorm(x)
+//   ffn_out  = swiglu_mlp(ffn_in)
+//   h_next   = x + ffn_out
+//   logits   = shared_head_output(shared_head_norm(h_next))
+//
+// The attention block reuses the Step35 head-wise sigmoid gate and partial
+// rotary embeddings (full_attention => n_rot = head_dim/2).
+llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params)
+    : llm_graph_context(params) {
+    GGML_ASSERT(hparams.nextn_predict_layers > 0 && "STEP35 MTP requires nextn_predict_layers > 0");
+
+    // Use the first MTP block (lowest index). Multi-block chains are driven
+    // externally by re-invoking this graph with refreshed (h, token) pairs.
+    const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
+    const auto & layer = model.layers[il];
+
+    GGML_ASSERT(layer.nextn.eh_proj && "STEP35 MTP: missing nextn.eh_proj");
+    GGML_ASSERT(layer.nextn.enorm && "STEP35 MTP: missing nextn.enorm");
+    GGML_ASSERT(layer.nextn.hnorm && "STEP35 MTP: missing nextn.hnorm");
+    GGML_ASSERT(layer.ffn_gate && layer.ffn_up && layer.ffn_down && "STEP35 MTP: missing dense MLP weights");
+
+    // Input plumbing: the MTP graph takes (token_id, h_pre_norm_row) per draft
+    // position. We expose them through the standard llm_graph_input_embd.
+    auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
+    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_input(inp->tokens);
+    inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    ggml_set_input(inp->embd);
+    ggml_set_name(inp->embd, "mtp_h_input");
+
+    ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; // per-block table when loaded, else the trunk embedding
+
+    ggml_tensor * h_input = inp->embd;
+    ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+    cb(tok_embd, "mtp_tok_embd", il);
+
+    res->add_input(std::move(inp)); // `inp` is moved-from after this line; only h_input / tok_embd are used below
+
+    ggml_tensor * inp_pos = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+    auto * inp_attn = build_attn_inp_kv();
+
+    // hnorm/enorm + eh_proj
+    ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
+    cb(h_norm, "mtp_hnorm", il);
+    ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il);
+    cb(e_norm, "mtp_enorm", il);
+
+    ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); // embedding half first, hidden half second
+    cb(concat, "mtp_concat", il);
+
+    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
+    cb(cur, "mtp_eh_proj", il);
+
+    ggml_tensor * inpSA = cur; // residual input added back after the attention block
+
+    // -------------------------------------------------------------------------
+    // Step35-style attention block (mirrors graph::graph for full-attention)
+    // -------------------------------------------------------------------------
+    {
+        cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "mtp_attn_norm", il);
+
+        const uint32_t n_head_l    = hparams.n_head(il);
+        const uint32_t n_head_kv_l = hparams.n_head_kv(il);
+
+        ggml_tensor * Qcur = build_lora_mm(layer.wq, cur);
+        ggml_tensor * Kcur = build_lora_mm(layer.wk, cur);
+        ggml_tensor * Vcur = build_lora_mm(layer.wv, cur);
+        cb(Qcur, "mtp_Qcur", il);
+        cb(Kcur, "mtp_Kcur", il);
+        cb(Vcur, "mtp_Vcur", il);
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l, n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
+
+        if (layer.attn_q_norm) {
+            Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
+            cb(Qcur, "mtp_Qcur_normed", il);
+        }
+        if (layer.attn_k_norm) {
+            Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
+            cb(Kcur, "mtp_Kcur_normed", il);
+        }
+
+        // MTP block is full-attention (n_rot = head_dim/2 like main full-attn).
+        const float freq_base_l = model.get_rope_freq_base(cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+        const int64_t n_rot_l = hparams.n_rot(il);
+
+        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
+                n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
+                n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+        cb(Qcur, "mtp_Qcur_pos", il);
+        cb(Kcur, "mtp_Kcur_pos", il);
+
+        const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k));
+        ggml_tensor * attn_out = build_attn(inp_attn,
+                nullptr, nullptr, nullptr,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        cb(attn_out, "mtp_attn_out_raw", il);
+
+        // head-wise sigmoid attention gate (g_proj)
+        if (layer.wqkv_gate) {
+            ggml_tensor * gate = build_lora_mm(layer.wqkv_gate, cur); // `cur` is still the attn_norm output here
+            cb(gate, "mtp_attn_gate", il);
+            gate = ggml_sigmoid(ctx0, gate);
+            cb(gate, "mtp_attn_gate_sigmoid", il);
+
+            ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens);
+            ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate,       1,          n_head_l, n_tokens); // one gate value per (head, token)
+            attn_3d = ggml_mul(ctx0, attn_3d, gate_3d);
+            attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens);
+            cb(attn_out, "mtp_attn_gated", il);
+        }
+
+        cur = build_lora_mm(layer.wo, attn_out);
+        cb(cur, "mtp_attn_proj", il);
+    }
+
+    cur = ggml_add(ctx0, cur, inpSA);
+    cb(cur, "mtp_attn_residual", il);
+
+    // -------------------------------------------------------------------------
+    // Dense SwiGLU MLP — Step-3.7 MTP blocks use a single dense MLP (not MoE).
+    // HF `post_attention_layernorm` functions as the pre-FFN norm (FFN_NORM).
+    // -------------------------------------------------------------------------
+    ggml_tensor * ffn_residual = cur;
+    GGML_ASSERT(layer.ffn_norm && "STEP35 MTP: missing ffn_norm (HF post_attention_layernorm)");
+    cur = build_norm(cur, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
+    cb(cur, "mtp_ffn_norm", il);
+
+    cur = build_ffn(cur,
+            layer.ffn_up,   nullptr, nullptr,
+            layer.ffn_gate, nullptr, nullptr,
+            layer.ffn_down, nullptr, nullptr,
+            nullptr,
+            LLM_FFN_SILU, LLM_FFN_PAR, il);
+    cb(cur, "mtp_ffn_out", il);
+
+    cur = ggml_add(ctx0, cur, ffn_residual);
+    cb(cur, "mtp_post_ffn", il);
+
+    // Pre-norm hidden state for the AR draft loop (consumed as next h_input).
+    cb(cur, "h_pre_norm", -1);
+    res->t_h_pre_norm = cur; // captured before the inp_out_ids row selection below
+
+    if (inp_out_ids) {
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    }
+
+    // Per-block shared head: use nextn.shared_head_norm / nextn.shared_head_head
+    // when present; otherwise fall back to the main output_norm / output (i.e.
+    // tied LM head when the MTP block has no dedicated head — Step-3.7 always
+    // ships a per-block head).
+    ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? layer.nextn.shared_head_norm : model.output_norm;
+    GGML_ASSERT(head_norm_w && "STEP35 MTP: missing shared_head_norm / output_norm");
+    cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1);
+    cb(cur, "mtp_shared_head_norm", -1);
+
+    ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
+    GGML_ASSERT(head_w && "STEP35 MTP: missing shared_head_head / output");
+    cur = build_lora_mm(head_w, cur);
+    cb(cur, "result_output", -1);
+
+    res->t_logits = cur;
+    ggml_build_forward_expand(gf, cur);
+}