From 1f1bb6423fe6a9b437d2265cda413f611661a389 Mon Sep 17 00:00:00 2001
From: forforever73 <690105611@qq.com>
Date: Wed, 27 May 2026 11:24:38 +0800
Subject: [PATCH 1/6] feat: support step3.7

---
 common/jinja/value.cpp | 17 +++++++++++++++++
 conversion/__init__.py |  2 ++
 conversion/base.py     |  2 +-
 conversion/step3.py    | 43 ++++++++++++++++++++++++++++++++++++++----
 tests/test-jinja.cpp   |  7 +++++++
 5 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp
index 0b79098cd1e7..ea03e86e72ec 100644
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@@ -594,6 +594,8 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
     throw not_implemented_exception("String join builtin not implemented");
 }
 
+static value fromjson(const func_args & args);
+
 const func_builtins & value_string_t::get_builtins() const {
     static const func_builtins builtins = {
         {"default", default_value},
@@ -813,6 +815,7 @@ const func_builtins & value_string_t::get_builtins() const {
             args.ensure_vals<value_string>();
             return args.get_pos(0);
         }},
+        {"fromjson", fromjson},
         {"tojson", tojson},
         {"indent", [](const func_args &args) -> value {
             args.ensure_count(1, 4);
@@ -1285,6 +1288,20 @@ static value from_json(const nlohmann::ordered_json & j, bool mark_input) {
     }
 }
 
+static value fromjson(const func_args & args) {
+    args.ensure_count(1);
+    args.ensure_vals<value_string>();
+
+    const auto & input = args.get_pos(0)->as_string();
+
+    try {
+        const auto parsed = nlohmann::ordered_json::parse(input.str());
+        return from_json(parsed, input.all_parts_are_input());
+    } catch (const nlohmann::json::exception & e) {
+        throw raised_exception("fromjson: failed to parse JSON: " + std::string(e.what()));
+    }
+}
+
 // compare operator for value_t
 bool value_compare(const value & a, const value & b, value_compare_op op) {
     auto cmp = [&]() {
diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2a87bd75b441..c4a170acf2f8 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -213,6 +213,7 @@
     "Starcoder2ForCausalLM": "starcoder",
     "Step3p5ForCausalLM": "step3",
     "StepVLForConditionalGeneration": "step3",
+    "Step3p7ForConditionalGeneration": "step3",
     "T5EncoderModel": "t5",
     "T5ForConditionalGeneration": "t5",
     "T5WithLMHeadModel": "t5",
@@ -279,6 +280,7 @@
     "Sarashina2VisionForCausalLM": "sarashina2",
     "SmolVLMForConditionalGeneration": "smolvlm",
     "StepVLForConditionalGeneration": "step3",
+    "Step3p7ForConditionalGeneration": "step3",
     "UltravoxModel": "ultravox",
     "VoxtralForConditionalGeneration": "ultravox",
     "YoutuVLForConditionalGeneration": "youtuvl",
diff --git a/conversion/base.py b/conversion/base.py
index 1d3554ea2972..55be84411959 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -2552,7 +2552,7 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st
     # Step3-VL keeps text config under text_config but uses a custom top-level architecture.
     # For text conversion we route to a dedicated text-only class.
     # TODO: refactor this later to avoid adding exception here
-    if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM"):
+    if model_type == ModelType.TEXT and arch in ("StepVLForConditionalGeneration", "Sarashina2VisionForCausalLM", "Step3p7ForConditionalGeneration"):
         return arch
 
     # if "architectures" is found in the sub-config, use that instead
diff --git a/conversion/step3.py b/conversion/step3.py
index ba867fb831ba..f7b0abca97c6 100644
--- a/conversion/step3.py
+++ b/conversion/step3.py
@@ -95,6 +95,13 @@ class Step3VLTextModel(Qwen3Model):
     model_arch = gguf.MODEL_ARCH.QWEN3
 
 
+# Step3.7 reuses the Step3-VL vision tower/projector; a separate subclass is only
+# needed because ModelBase.register maps each arch to a class.
+@ModelBase.register("Step3p7ForConditionalGeneration")
+class Step37VisionModel(Step3VLVisionModel):
+    pass
+
+
 @ModelBase.register("Step3p5ForCausalLM")
 class Step35Model(TextModel):
     model_arch = gguf.MODEL_ARCH.STEP35
@@ -203,11 +210,23 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
         if isinstance(rope_theta, list):
             rope_theta = rope_theta[0]
         base = float(rope_theta)
-        if (dim := self.hparams.get("head_dim")) is None:
-            dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
-        dim = int(dim)
 
-        freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+        if (storage_dim := self.hparams.get("head_dim")) is None:
+            storage_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+        storage_dim = int(storage_dim)
+
+        # Llama 3 factors apply only to the rotary dims used by full_attention layers
+        # (partial_rotary_factor * head_dim). Remaining slots are padded with 1.0 so
+        # sliding_attention layers remain unaffected. set_gguf_parameters already
+        # guarantees at least one full_attention layer.
+        layer_types = (self.hparams.get("layer_types") or [])[: self.block_count]
+        partial_rotary_factors = (self.hparams.get("partial_rotary_factors") or [])[: self.block_count]
+        full_attention_factor = next(
+            float(f) for lt, f in zip(layer_types, partial_rotary_factors) if lt == "full_attention"
+        )
+        rotary_dim = int(storage_dim * full_attention_factor)
+
+        freqs = 1.0 / (base ** (torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
 
         factor = float(rope_params.get("factor", 8.0))
         low_freq_factor = float(rope_params.get("low_freq_factor", 1.0))
@@ -228,4 +247,20 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
                 smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
                 rope_factors.append(1.0 / ((1.0 - smooth) / factor + smooth))
 
+        # Pad to head_dim/2 with 1.0 so non-scaled layers remain neutral.
+        if len(rope_factors) < storage_dim // 2:
+            rope_factors.extend([1.0] * (storage_dim // 2 - len(rope_factors)))
+
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
+
+
+@ModelBase.register("Step3p7ForConditionalGeneration")
+class Step37TextModel(Step35Model):
+    model_arch = gguf.MODEL_ARCH.STEP35
+
+    @classmethod
+    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+        name, _ = item
+        if name.startswith(("vision_model.", "model.vision_model.", "vit_large_projector.")):
+            return None
+        return super().filter_tensors(item)
diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp
index b5ee53461e8d..afc64c4c9698 100644
--- a/tests/test-jinja.cpp
+++ b/tests/test-jinja.cpp
@@ -523,6 +523,12 @@ static void test_set_statement(testing & t) {
 }
 
 static void test_filters(testing & t) {
+    test_template(t, "fromjson parses object string",
+        "{% set arguments = '{\"arg\": \"hello\"}' | fromjson %}{{ arguments.arg }}",
+        json::object(),
+        "hello"
+    );
+
     test_template(t, "upper",
         "{{ 'hello'|upper }}",
         json::object(),
@@ -2004,6 +2010,7 @@ def raise_exception(message):
     raise jinja2.exceptions.TemplateError(message)
 
 env.filters["tojson"] = lambda x, ensure_ascii=False, indent=None, separators=None, sort_keys=False: json.dumps(x, ensure_ascii=ensure_ascii, indent=indent, separators=separators, sort_keys=sort_keys)
+env.filters["fromjson"] = lambda x: json.loads(x)
 env.globals["strftime_now"] = lambda format: datetime.now().strftime(format)
 env.globals["raise_exception"] = raise_exception
 

From 38eee9e0cb714a0e44ced0cca7c16c597ddba9d6 Mon Sep 17 00:00:00 2001
From: forforever73 <690105611@qq.com>
Date: Fri, 29 May 2026 13:08:45 +0800
Subject: [PATCH 2/6] fix: register Step-3.7 BPE pre-tokenizer hash

---
 conversion/base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/conversion/base.py b/conversion/base.py
index 55be84411959..54e0385af6aa 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -1541,6 +1541,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
             res = "deepseek-v3"
+        if chkhsh == "5841594bd6a8eeecd7207aeec6570831cc97ffaeba51e908bdaf560113177bae":
+            # ref: https://huggingface.co/stepfun-ai/Step-3.7-Flash (Mistral-style pre-tokenizer; split regex identical to deepseek-v3)
+            res = "deepseek-v3"
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"

From ae0dcaca9f34babc6ab09c04d541844b1ddb45fb Mon Sep 17 00:00:00 2001
From: forforever73 <690105611@qq.com>
Date: Fri, 29 May 2026 17:31:41 +0800
Subject: [PATCH 3/6] delete fromjson

---
 common/jinja/value.cpp | 16 ----------------
 tests/test-jinja.cpp   |  7 -------
 2 files changed, 23 deletions(-)

diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp
index ea03e86e72ec..5c72283eb584 100644
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@@ -594,7 +594,6 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
     throw not_implemented_exception("String join builtin not implemented");
 }
 
-static value fromjson(const func_args & args);
 
 const func_builtins & value_string_t::get_builtins() const {
     static const func_builtins builtins = {
@@ -815,7 +814,6 @@ const func_builtins & value_string_t::get_builtins() const {
             args.ensure_vals<value_string>();
             return args.get_pos(0);
         }},
-        {"fromjson", fromjson},
         {"tojson", tojson},
         {"indent", [](const func_args &args) -> value {
             args.ensure_count(1, 4);
@@ -1288,20 +1286,6 @@ static value from_json(const nlohmann::ordered_json & j, bool mark_input) {
     }
 }
 
-static value fromjson(const func_args & args) {
-    args.ensure_count(1);
-    args.ensure_vals<value_string>();
-
-    const auto & input = args.get_pos(0)->as_string();
-
-    try {
-        const auto parsed = nlohmann::ordered_json::parse(input.str());
-        return from_json(parsed, input.all_parts_are_input());
-    } catch (const nlohmann::json::exception & e) {
-        throw raised_exception("fromjson: failed to parse JSON: " + std::string(e.what()));
-    }
-}
-
 // compare operator for value_t
 bool value_compare(const value & a, const value & b, value_compare_op op) {
     auto cmp = [&]() {
diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp
index afc64c4c9698..b5ee53461e8d 100644
--- a/tests/test-jinja.cpp
+++ b/tests/test-jinja.cpp
@@ -523,12 +523,6 @@ static void test_set_statement(testing & t) {
 }
 
 static void test_filters(testing & t) {
-    test_template(t, "fromjson parses object string",
-        "{% set arguments = '{\"arg\": \"hello\"}' | fromjson %}{{ arguments.arg }}",
-        json::object(),
-        "hello"
-    );
-
     test_template(t, "upper",
         "{{ 'hello'|upper }}",
         json::object(),
@@ -2010,7 +2004,6 @@ def raise_exception(message):
     raise jinja2.exceptions.TemplateError(message)
 
 env.filters["tojson"] = lambda x, ensure_ascii=False, indent=None, separators=None, sort_keys=False: json.dumps(x, ensure_ascii=ensure_ascii, indent=indent, separators=separators, sort_keys=sort_keys)
-env.filters["fromjson"] = lambda x: json.loads(x)
 env.globals["strftime_now"] = lambda format: datetime.now().strftime(format)
 env.globals["raise_exception"] = raise_exception
 

From eb1bc0ed3001551f621a8de801ae62c679573a98 Mon Sep 17 00:00:00 2001
From: forforever73 <690105611@qq.com>
Date: Fri, 29 May 2026 23:42:58 +0800
Subject: [PATCH 4/6] register step3.7 arch to Step35Model

---
 common/jinja/value.cpp |  1 -
 conversion/base.py     |  3 ---
 conversion/step3.py    | 26 +++++---------------------
 3 files changed, 5 insertions(+), 25 deletions(-)

diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp
index 5c72283eb584..0b79098cd1e7 100644
--- a/common/jinja/value.cpp
+++ b/common/jinja/value.cpp
@@ -594,7 +594,6 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
     throw not_implemented_exception("String join builtin not implemented");
 }
 
-
 const func_builtins & value_string_t::get_builtins() const {
     static const func_builtins builtins = {
         {"default", default_value},
diff --git a/conversion/base.py b/conversion/base.py
index 54e0385af6aa..55be84411959 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -1541,9 +1541,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
             res = "deepseek-v3"
-        if chkhsh == "5841594bd6a8eeecd7207aeec6570831cc97ffaeba51e908bdaf560113177bae":
-            # ref: https://huggingface.co/stepfun-ai/Step-3.7-Flash (Mistral-style pre-tokenizer; split regex identical to deepseek-v3)
-            res = "deepseek-v3"
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
diff --git a/conversion/step3.py b/conversion/step3.py
index f7b0abca97c6..0b5a8ca3c6ee 100644
--- a/conversion/step3.py
+++ b/conversion/step3.py
@@ -15,7 +15,7 @@
 from .qwen import Qwen3Model
 
 
-@ModelBase.register("StepVLForConditionalGeneration")
+@ModelBase.register("StepVLForConditionalGeneration", "Step3p7ForConditionalGeneration")
 class Step3VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -95,14 +95,7 @@ class Step3VLTextModel(Qwen3Model):
     model_arch = gguf.MODEL_ARCH.QWEN3
 
 
-# Step3.7 reuses the Step3-VL vision tower/projector; a separate subclass is only
-# needed because ModelBase.register maps each arch to a class.
-@ModelBase.register("Step3p7ForConditionalGeneration")
-class Step37VisionModel(Step3VLVisionModel):
-    pass
-
-
-@ModelBase.register("Step3p5ForCausalLM")
+@ModelBase.register("Step3p5ForCausalLM", "Step3p7ForConditionalGeneration")
 class Step35Model(TextModel):
     model_arch = gguf.MODEL_ARCH.STEP35
 
@@ -176,6 +169,9 @@ def set_gguf_parameters(self):
     def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
         name, gen = item
 
+        if name.startswith("vit_large_projector."):
+            return None
+
         # Map router bias (expert selection bias) to a GGUF bias tensor
         if name.endswith(".moe.router_bias"):
             name += ".bias"
@@ -252,15 +248,3 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
             rope_factors.extend([1.0] * (storage_dim // 2 - len(rope_factors)))
 
         yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32))
-
-
-@ModelBase.register("Step3p7ForConditionalGeneration")
-class Step37TextModel(Step35Model):
-    model_arch = gguf.MODEL_ARCH.STEP35
-
-    @classmethod
-    def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
-        name, _ = item
-        if name.startswith(("vision_model.", "model.vision_model.", "vit_large_projector.")):
-            return None
-        return super().filter_tensors(item)

From 84c5857e6163778cebb47a502a881e32e89dceee Mon Sep 17 00:00:00 2001
From: forforever73 <690105611@qq.com>
Date: Sat, 30 May 2026 02:23:29 +0800
Subject: [PATCH 5/6] drop vit projector in base filter

---
 conversion/base.py  | 2 +-
 conversion/step3.py | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/conversion/base.py b/conversion/base.py
index 55be84411959..10969a3156ab 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -1112,7 +1112,7 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
         name, gen = item
 
         # Skip multimodal tensors
-        if name.startswith(("mlp", "vit.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
+        if name.startswith(("mlp", "vit.", "vit_large_projector.", "vpm.", "siglip2.", "conformer.", "merger.", "resampler.", "sound_encoder.", "sound_projection.", "speech_embeddings.")) \
                 or "visual." in name or "vision." in name or "audio." in name or "talker." in name \
                 or "vision_" in name or "audio_" in name or "sam_model" in name \
                 or "token2wav." in name or "code2wav." in name \
diff --git a/conversion/step3.py b/conversion/step3.py
index 0b5a8ca3c6ee..673fa14af3e3 100644
--- a/conversion/step3.py
+++ b/conversion/step3.py
@@ -168,10 +168,6 @@ def set_gguf_parameters(self):
     @classmethod
     def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
         name, gen = item
-
-        if name.startswith("vit_large_projector."):
-            return None
-
         # Map router bias (expert selection bias) to a GGUF bias tensor
         if name.endswith(".moe.router_bias"):
             name += ".bias"

From 7e6776a1d44daf0cdea75fd6209b205f9ca21dc0 Mon Sep 17 00:00:00 2001
From: eauchs <138336683+eauchs@users.noreply.github.com>
Date: Sun, 31 May 2026 21:33:22 +0200
Subject: [PATCH 6/6] feat(step3.7): support NextN/MTP heads for speculative
 decoding

Step-3.7-Flash ships num_nextn_predict_layers (3) dense MTP blocks after
the main transformer (HF model.layers.{N..N+K-1}). The current converter
silently drops them and the runtime arch graph never declares an MTP
draft head, so `--spec-type draft-mtp` is unavailable for step35 GGUFs.

This change wires the full chain end-to-end:

conversion/step3.py
  - Extend block_count by num_nextn_predict_layers.
  - Stop filtering HF layers >= num_hidden_layers when MTP is enabled.
  - Emit `step35.nextn_predict_layers` GGUF metadata.
  - Pad per-layer arrays (layer_types, partial_rotary_factors,
    swiglu_limits[_shared]) for the MTP blocks (full-attention, no clamp).

gguf-py/gguf/constants.py
  - Register the NEXTN_* tensors on MODEL_ARCH.STEP35.

gguf-py/gguf/tensor_mapping.py
  - Map Step-3.7's `transformer.shared_head.{norm,output}` to
    NEXTN_SHARED_HEAD_{NORM,HEAD}.

src/models/step35.cpp + src/models/models.h
  - Read `nextn_predict_layers` in load_arch_hparams; force the trailing
    blocks to full-attention.
  - Split tensor loading: trunk (MoE + shared expert + Step35 attn) for
    [0, n_main) and MTP heads (dense SwiGLU MLP + nextn.* + per-block
    shared head) for [n_main, n_layer).
  - Trim the main forward to n_transformer_layers and expose
    res->t_h_pre_norm so the draft head can seed AR steps.
  - Implement llama_model_step35::graph_mtp following the Qwen3.5
    single-block convention but with Step35 attention semantics
    (head-wise sigmoid gate, q/k norm, partial rotary) and a dense
    MLP (Step-3.7 MTP heads use mlp.{gate,up,down}_proj, not MoE).
---
 conversion/step3.py            |  56 +++++--
 gguf-py/gguf/constants.py      |   6 +
 gguf-py/gguf/tensor_mapping.py |   2 +
 src/models/models.h            |   7 +
 src/models/step35.cpp          | 292 ++++++++++++++++++++++++++++++++-
 5 files changed, 349 insertions(+), 14 deletions(-)

diff --git a/conversion/step3.py b/conversion/step3.py
index 673fa14af3e3..b6c8c0d1060a 100644
--- a/conversion/step3.py
+++ b/conversion/step3.py
@@ -99,6 +99,19 @@ class Step3VLTextModel(Qwen3Model):
 class Step35Model(TextModel):
     model_arch = gguf.MODEL_ARCH.STEP35
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Step-3.7 ships NextN/MTP heads (num_nextn_predict_layers > 0) after the
+        # main transformer stack. We expose them as extra blocks (blk.N..blk.N+K-1)
+        # so the model loader can find their blk.%d.nextn.* tensors as well as
+        # their dense MLP and shared-head tensors.
+        nextn = int(self.hparams.get("num_nextn_predict_layers", 0))
+        self._nextn_predict_layers = nextn
+        self._n_main_layers = int(self.hparams["num_hidden_layers"])
+        if nextn > 0:
+            self.block_count = self._n_main_layers + nextn
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
     def set_gguf_parameters(self):
         rope_theta = self.hparams.get("rope_theta")
         if isinstance(rope_theta, list):
@@ -109,8 +122,11 @@ def set_gguf_parameters(self):
 
         super().set_gguf_parameters()
 
-        layer_types = self.hparams.get("layer_types") or []
-        partial_rotary_factors = self.hparams.get("partial_rotary_factors") or []
+        nextn = self._nextn_predict_layers
+        n_main = self._n_main_layers
+
+        layer_types = list(self.hparams.get("layer_types") or [])
+        partial_rotary_factors = list(self.hparams.get("partial_rotary_factors") or [])
         attn_other = self.hparams.get("attention_other_setting") or {}
 
         n_head_base = self.hparams["num_attention_heads"]
@@ -119,9 +135,19 @@ def set_gguf_parameters(self):
         n_head_swa = attn_other.get("num_attention_heads", n_head_base)
         n_kv_swa = attn_other.get("num_attention_groups", n_kv_base)
 
-        layer_types = layer_types[: self.block_count]
-        partial_rotary_factors = partial_rotary_factors[: self.block_count]
+        # Trim the HF lists to the main transformer length first; the upstream
+        # config sometimes includes entries for the MTP heads, sometimes not.
+        layer_types = layer_types[:n_main]
+        partial_rotary_factors = partial_rotary_factors[:n_main]
         assert [1.0 if lt == "sliding_attention" else 0.5 for lt in layer_types] == partial_rotary_factors
+
+        # MTP heads are full-attention only and use the full-attention rope branch
+        # (half rope dims, base rope_theta). Extend per-layer arrays accordingly so
+        # the GGUF carries one entry per block.
+        if nextn > 0:
+            layer_types += ["full_attention"] * nextn
+            partial_rotary_factors += [0.5] * nextn
+
         head_arr = [n_head_swa if lt == "sliding_attention" else n_head_base for lt in layer_types]
         kv_arr = [n_kv_swa if lt == "sliding_attention" else n_kv_base for lt in layer_types]
         swa_pat = [lt == "sliding_attention" for lt in layer_types]
@@ -157,12 +183,21 @@ def set_gguf_parameters(self):
 
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5))
 
-        # Optional per-layer SwiGLU clamps.
+        # NextN/MTP heads — Step-3.7 ships num_nextn_predict_layers dense MTP
+        # blocks after the main transformer (model.layers.N..N+K-1 in HF).
+        if self._nextn_predict_layers > 0:
+            self.gguf_writer.add_nextn_predict_layers(self._nextn_predict_layers)
+
+        # Optional per-layer SwiGLU clamps. Pad with 0.0 for the MTP blocks
+        # (MTP heads use a dense MLP without clamping), so the array length
+        # matches block_count.
         if (limits := self.hparams.get("swiglu_limits")) is not None:
-            limits_f = [0.0 if v is None else float(v) for v in limits[: self.block_count]]
+            limits_f = [0.0 if v is None else float(v) for v in limits[: self._n_main_layers]]
+            limits_f += [0.0] * self._nextn_predict_layers
             self.gguf_writer.add_swiglu_clamp_exp(limits_f)
         if (limits_shared := self.hparams.get("swiglu_limits_shared")) is not None:
-            limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self.block_count]]
+            limits_shared_f = [0.0 if v is None else float(v) for v in limits_shared[: self._n_main_layers]]
+            limits_shared_f += [0.0] * self._nextn_predict_layers
             self.gguf_writer.add_swiglu_clamp_shexp(limits_shared_f)
 
     @classmethod
@@ -175,11 +210,12 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
         return super().filter_tensors((name, gen))
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
-        # remove mtp layers
+        # Step-3.7 MTP heads live at model.layers.{N..N+K-1}.{eh_proj,enorm,hnorm,...}.
+        # Keep them when nextn_predict_layers > 0 (mapped via NEXTN_* tensors);
+        # otherwise drop them, preserving the previous text-only conversion behavior.
         if (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None:
             il = int(m.group(1))
-            n_main = int(self.hparams.get("num_hidden_layers", self.block_count))
-            if il >= n_main:
+            if il >= self._n_main_layers and self._nextn_predict_layers == 0:
                 return
         if name.endswith("norm.weight"):
             data_torch += 1.0
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 0189f6f03c51..37f688ae72d6 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -3937,6 +3937,12 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE_SHEXP,
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_EXP_PROBS_B,
+        # NextN/MTP heads (Step-3.7 num_nextn_predict_layers > 0)
+        MODEL_TENSOR.NEXTN_EH_PROJ,
+        MODEL_TENSOR.NEXTN_ENORM,
+        MODEL_TENSOR.NEXTN_HNORM,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
+        MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
     ],
     MODEL_ARCH.LLAMA_EMBED: [
         MODEL_TENSOR.TOKEN_EMBD,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index ecc3c05f99ac..7d5502c4e852 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -2273,10 +2273,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: (
             "model.layers.{bid}.shared_head.head",
+            "model.layers.{bid}.transformer.shared_head.output",  # step3.7
         ),
 
         MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: (
             "model.layers.{bid}.shared_head.norm",
+            "model.layers.{bid}.transformer.shared_head.norm",    # step3.7
         ),
     }
 
diff --git a/src/models/models.h b/src/models/models.h
index db228865d5d0..fe85f742f542 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -1900,5 +1900,12 @@ struct llama_model_step35 : public llama_model_base {
         graph(const llama_model & model, const llm_graph_params & params);
     };
 
+    // NextN/MTP draft head used by --spec-type draft-mtp.
+    // Steps the AR draft loop one position ahead using the pre-norm hidden
+    // state from the trunk and the embedding of the previous draft token.
+    struct graph_mtp : public llm_graph_context {
+        graph_mtp(const llama_model & model, const llm_graph_params & params);
+    };
+
     std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
diff --git a/src/models/step35.cpp b/src/models/step35.cpp
index 3b68e68707ae..849a5adc9db5 100644
--- a/src/models/step35.cpp
+++ b/src/models/step35.cpp
@@ -26,7 +26,20 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP,   hparams.swiglu_clamp_exp,   hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_SHEXP, hparams.swiglu_clamp_shexp, hparams.n_layer, false);
 
-    switch (hparams.n_layer) {
+    // NextN/MTP heads — Step-3.7 trails the main transformer with
+    // num_nextn_predict_layers dense MTP blocks (model.layers.N..N+K-1 in HF).
+    // The converter appends them to block_count so n_layer reflects the total.
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
+    GGML_ASSERT(hparams.nextn_predict_layers < hparams.n_layer && "nextn_predict_layers must be < n_layer");
+
+    // The MTP blocks are dense + full-attention. The converter already marks
+    // them as full_attention in swa_layers; this loop defensively enforces it.
+    for (uint32_t i = hparams.n_layer - hparams.nextn_predict_layers; i < hparams.n_layer; ++i) {
+        hparams.swa_layers[i] = false;
+    }
+
+    const uint32_t n_main_layer = hparams.n_layer - hparams.nextn_predict_layers;
+    switch (n_main_layer) {
         case 45: type = LLM_TYPE_196B_A11B; break;
         default: type = LLM_TYPE_UNKNOWN;
     }
@@ -35,6 +48,8 @@ void llama_model_step35::load_arch_hparams(llama_model_loader & ml) {
 void llama_model_step35::load_arch_tensors(llama_model_loader &) {
     LLAMA_LOAD_LOCALS;
 
+    const int n_main = n_layer - (int) hparams.nextn_predict_layers;
+
     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
     // output
@@ -51,7 +66,7 @@ void llama_model_step35::load_arch_tensors(llama_model_loader &) {
         n_rot_max = n_rot;
     }
 
-    for (int i = 0; i < n_layer; ++i) {
+    auto load_block_trunk = [&](int i) {
         auto & layer = layers[i];
 
         const uint32_t n_head_l      = hparams.n_head(i);
@@ -95,10 +110,74 @@ void llama_model_step35::load_arch_tensors(llama_model_loader &) {
         layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
         layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd, hparams.n_ff_shexp}, TENSOR_NOT_REQUIRED);
         layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED);
+    };
+
+    // Step-3.7 MTP block layout (per HF safetensors index):
+    //   model.layers.{N..N+K-1}.{eh_proj, enorm, hnorm,
+    //                            input_layernorm, post_attention_layernorm,
+    //                            self_attn.{q,k,v,o,g}_proj, self_attn.{q,k}_norm,
+    //                            mlp.{gate,up,down}_proj,
+    //                            transformer.shared_head.{norm,output}}
+    // Each MTP head is a single transformer block with full attention and a
+    // DENSE SwiGLU MLP (not MoE). It owns its own LM head (shared head).
+    auto load_block_mtp = [&](int i) {
+        auto & layer = layers[i];
+
+        const uint32_t n_head_l      = hparams.n_head(i);
+        const uint32_t n_embd_k_gqa  = hparams.n_embd_k_gqa(i);
+        const uint32_t n_embd_v_gqa  = hparams.n_embd_v_gqa(i);
+
+        // Pre-attention norm weight (required; HF input_layernorm)
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        // Optional per-head Q/K norm weights; the MTP graph applies each one only when it was loaded
+        layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
+        layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED);
+
+        // RoPE factor tensors (optional, TENSOR_DUPLICATED; see trunk): long/short pair under LongRoPE scaling, else rope_freqs
+        if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
+            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED);
+            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED);
+        } else {
+            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED);
+        }
+
+        create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, 0);
+        layer.wo        = create_tensor(tn(LLM_TENSOR_ATTN_OUT,  "weight", i), {n_embd_head_v * n_head_l, n_embd}, 0);
+        layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED); // optional head-wise gate (HF g_proj)
+
+        // Dense SwiGLU MLP (mlp.gate_proj, mlp.up_proj, mlp.down_proj in HF); all three
+        // weights and the norm are required. `post_attention_layernorm` in the HF MTP block
+        // functions as the pre-FFN norm and therefore maps to FFN_NORM via the tensor name map.
+        // NOTE(review): n_ff is not per-layer here (unlike n_head_l above) — confirm it equals the MTP block's intermediate_size.
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, 0);
+
+        // NextN-specific tensors: eh_proj maps the 2*n_embd concat back to n_embd; enorm/hnorm are required
+        layer.nextn.eh_proj          = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ,          "weight", i), {2 * n_embd, n_embd}, 0);
+        layer.nextn.enorm            = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM,            "weight", i), {n_embd},             0);
+        layer.nextn.hnorm            = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM,            "weight", i), {n_embd},             0);
+        // Per-MTP-block shared head (transformer.shared_head.{norm,output}); optional at load time — the MTP graph falls back to output_norm / output.
+        layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), {n_embd},             TENSOR_NOT_REQUIRED);
+        layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), {n_embd, n_vocab},    TENSOR_NOT_REQUIRED);
+        // Step-3.7 does not ship a per-block embed_tokens — main tok_embd is reused by the MTP graph when this is absent.
+        layer.nextn.embed_tokens     = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS,     "weight", i), {n_embd, n_vocab},    TENSOR_NOT_REQUIRED);
+    };
+
+    for (int i = 0; i < n_main; ++i) {
+        load_block_trunk(i);
+    }
+    for (int i = n_main; i < n_layer; ++i) {
+        load_block_mtp(i);
     }
 }
 
 std::unique_ptr<llm_graph_context> llama_model_step35::build_arch_graph(const llm_graph_params & params) const {
+    if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) {
+        return std::make_unique<graph_mtp>(*this, params);
+    }
     return std::make_unique<graph>(*this, params);
 }
 
@@ -111,7 +190,10 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para
     auto        * inp_attn    = build_attn_inp_kv_iswa();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
-    for (int il = 0; il < n_layer; ++il) {
+    // Iterate only the main transformer stack; the trailing nextn_predict_layers
+    // blocks are MTP heads invoked via LLM_GRAPH_TYPE_DECODER_MTP.
+    const int n_transformer_layers = n_layer - (int) hparams.nextn_predict_layers;
+    for (int il = 0; il < n_transformer_layers; ++il) {
         ggml_tensor * inpSA = inpL;
 
         const uint32_t n_head_l    = hparams.n_head(il);
@@ -198,7 +280,7 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para
             cb(cur, "attn_proj", il);
         }
 
-        if (il == n_layer - 1 && inp_out_ids) {
+        if (il == n_transformer_layers - 1 && inp_out_ids) {
             cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
@@ -257,6 +339,11 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para
 
     cur = inpL;
 
+    // Expose the pre-norm hidden state — the MTP draft head consumes this as
+    // its `h_input` (the AR draft loop seeds successive MTP steps with it).
+    cb(cur, "h_pre_norm", -1);
+    res->t_h_pre_norm = cur;
+
     cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
@@ -267,3 +354,200 @@ llama_model_step35::graph::graph(const llama_model & model, const llm_graph_para
 
     ggml_build_forward_expand(gf, cur);
 }
+
+// =============================================================================
+// MTP draft head graph (LLM_GRAPH_TYPE_DECODER_MTP)
+// =============================================================================
+//
+// Step-3.7-Flash ships num_nextn_predict_layers (typically 3) dense MTP blocks
+// trailing the main transformer. Each block is a single full-attention layer
+// with a dense SwiGLU MLP and its own LM head. The block predicts the token
+// one position ahead given (h_prev, prev_token) where h_prev is the pre-norm
+// hidden state from the previous step (trunk for the first MTP step, then the
+// previous MTP block's pre-norm output for subsequent chained MTP calls).
+//
+// To stay aligned with the existing speculative driver this graph follows the
+// Qwen3.5 MTP layout (single-block draft per invocation). For Step-3.7 we use
+// the FIRST MTP block (lowest index). Multi-step draft chains can be issued by
+// calling this graph repeatedly with refreshed (h, token) pairs.
+//
+// Graph layout per the reference HF tensors:
+//   h_norm     = RMSNorm_h(h_input)            // hnorm
+//   e_norm     = RMSNorm_e(embed(prev_token))  // enorm
+//   x          = eh_proj(concat(e_norm, h_norm, dim=0))
+//   attn_in    = input_layernorm(x)
+//   attn_out   = step35_self_attn(attn_in)
+//   x          = x + attn_out
+//   ffn_in     = post_attention_layernorm(x)
+//   ffn_out    = swiglu_mlp(ffn_in)
+//   h_next     = x + ffn_out
+//   logits     = shared_head_output(shared_head_norm(h_next))
+//
+// The attention block reuses the Step35 head-wise sigmoid gate and partial
+// rotary embeddings (full_attention => n_rot = head_dim/2).
+llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params)
+    : llm_graph_context(params) {
+    GGML_ASSERT(hparams.nextn_predict_layers > 0 && "STEP35 MTP requires nextn_predict_layers > 0");
+
+    // Use the first MTP block (lowest index, i.e. the first layer after the main stack).
+    // Multi-block chains are driven externally by re-invoking this graph with refreshed (h, token) pairs.
+    const int il = (int) hparams.n_layer - (int) hparams.nextn_predict_layers;
+    const auto & layer = model.layers[il];
+
+    GGML_ASSERT(layer.nextn.eh_proj && "STEP35 MTP: missing nextn.eh_proj");
+    GGML_ASSERT(layer.nextn.enorm   && "STEP35 MTP: missing nextn.enorm");
+    GGML_ASSERT(layer.nextn.hnorm   && "STEP35 MTP: missing nextn.hnorm");
+    GGML_ASSERT(layer.ffn_gate && layer.ffn_up && layer.ffn_down && "STEP35 MTP: missing dense MLP weights");
+
+    // Input plumbing: the MTP graph takes (token_id, h_pre_norm_row) per draft position. Both travel in
+    // one llm_graph_input_embd: tokens (I32, n_tokens) and embd (F32, n_embd x n_tokens, named "mtp_h_input").
+    auto inp = std::make_unique<llm_graph_input_embd>(hparams.n_embd);
+    inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+    ggml_set_input(inp->tokens);
+    inp->embd   = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens);
+    ggml_set_input(inp->embd);
+    ggml_set_name(inp->embd, "mtp_h_input");
+
+    ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; // per-block table if loaded, else the main one
+
+    ggml_tensor * h_input  = inp->embd;
+    ggml_tensor * tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens);
+    cb(tok_embd, "mtp_tok_embd", il);
+
+    res->add_input(std::move(inp));
+
+    ggml_tensor * inp_pos     = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+    auto        * inp_attn    = build_attn_inp_kv(); // NOTE(review): the trunk graph uses build_attn_inp_kv_iswa() — confirm the plain KV input matches the memory context used for MTP
+
+    // RMS-normalize both inputs (hnorm on h_input, enorm on the token embedding), concatenate along dim 0
+    // with the embedding first, then project the result with eh_proj.
+    ggml_tensor * h_norm = build_norm(h_input, layer.nextn.hnorm, nullptr, LLM_NORM_RMS, il);
+    cb(h_norm, "mtp_hnorm", il);
+    ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, LLM_NORM_RMS, il);
+    cb(e_norm, "mtp_enorm", il);
+
+    ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0);
+    cb(concat, "mtp_concat", il);
+
+    ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat);
+    cb(cur, "mtp_eh_proj", il);
+
+    ggml_tensor * inpSA = cur;
+
+    // -------------------------------------------------------------------------
+    // Step35-style attention block (mirrors graph::graph for full-attention)
+    // -------------------------------------------------------------------------
+    {
+        cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "mtp_attn_norm", il);
+
+        const uint32_t n_head_l    = hparams.n_head(il);
+        const uint32_t n_head_kv_l = hparams.n_head_kv(il);
+
+        ggml_tensor * Qcur = build_lora_mm(layer.wq, cur); // NOTE(review): wq/wk/wv are not asserted — confirm create_tensor_qkv always fills the separate weights
+        ggml_tensor * Kcur = build_lora_mm(layer.wk, cur);
+        ggml_tensor * Vcur = build_lora_mm(layer.wv, cur);
+        cb(Qcur, "mtp_Qcur", il);
+        cb(Kcur, "mtp_Kcur", il);
+        cb(Vcur, "mtp_Vcur", il);
+
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head_l,    n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv_l, n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head_v, n_head_kv_l, n_tokens);
+
+        if (layer.attn_q_norm) {
+            Qcur = build_norm(Qcur, layer.attn_q_norm, nullptr, LLM_NORM_RMS, il);
+            cb(Qcur, "mtp_Qcur_normed", il);
+        }
+        if (layer.attn_k_norm) {
+            Kcur = build_norm(Kcur, layer.attn_k_norm, nullptr, LLM_NORM_RMS, il);
+            cb(Kcur, "mtp_Kcur_normed", il);
+        }
+
+        // MTP block is full-attention (n_rot = head_dim/2 like main full-attn); RoPE parameters are looked up for layer il.
+        const float freq_base_l  = model.get_rope_freq_base(cparams, il);
+        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);
+        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+        const int64_t n_rot_l = hparams.n_rot(il);
+
+        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
+                n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
+                n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
+                ext_factor, attn_factor, beta_fast, beta_slow);
+        cb(Qcur, "mtp_Qcur_pos", il);
+        cb(Kcur, "mtp_Kcur_pos", il);
+
+        const float kq_scale = 1.0f / sqrtf(float(n_embd_head_k));
+        ggml_tensor * attn_out = build_attn(inp_attn,
+                nullptr, nullptr, nullptr,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+        cb(attn_out, "mtp_attn_out_raw", il);
+
+        // head-wise sigmoid attention gate (g_proj): computed from the normed attention input, one value per head and token
+        if (layer.wqkv_gate) {
+            ggml_tensor * gate = build_lora_mm(layer.wqkv_gate, cur);
+            cb(gate, "mtp_attn_gate", il);
+            gate = ggml_sigmoid(ctx0, gate);
+            cb(gate, "mtp_attn_gate_sigmoid", il);
+
+            ggml_tensor * attn_3d = ggml_reshape_3d(ctx0, attn_out, n_embd_head_v, n_head_l, n_tokens);
+            ggml_tensor * gate_3d = ggml_reshape_3d(ctx0, gate,     1,             n_head_l, n_tokens);
+            attn_3d = ggml_mul(ctx0, attn_3d, gate_3d);
+            attn_out = ggml_reshape_2d(ctx0, attn_3d, n_embd_head_v * n_head_l, n_tokens);
+            cb(attn_out, "mtp_attn_gated", il);
+        }
+
+        cur = build_lora_mm(layer.wo, attn_out);
+        cb(cur, "mtp_attn_proj", il);
+    }
+
+    cur = ggml_add(ctx0, cur, inpSA);
+    cb(cur, "mtp_attn_residual", il);
+
+    // -------------------------------------------------------------------------
+    // Dense SwiGLU MLP — Step-3.7 MTP blocks use a single dense MLP (not MoE).
+    // HF `post_attention_layernorm` functions as the pre-FFN norm (FFN_NORM).
+    // -------------------------------------------------------------------------
+    ggml_tensor * ffn_residual = cur;
+    GGML_ASSERT(layer.ffn_norm && "STEP35 MTP: missing ffn_norm (HF post_attention_layernorm)");
+    cur = build_norm(cur, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
+    cb(cur, "mtp_ffn_norm", il);
+
+    cur = build_ffn(cur,
+            layer.ffn_up,   nullptr, nullptr,
+            layer.ffn_gate, nullptr, nullptr,
+            layer.ffn_down, nullptr, nullptr,
+            nullptr,
+            LLM_FFN_SILU, LLM_FFN_PAR, il);
+    cb(cur, "mtp_ffn_out", il);
+
+    cur = ggml_add(ctx0, cur, ffn_residual);
+    cb(cur, "mtp_post_ffn", il);
+
+    // Pre-norm hidden state for the AR draft loop (consumed as next h_input); captured before the output-row selection below.
+    cb(cur, "h_pre_norm", -1);
+    res->t_h_pre_norm = cur;
+
+    if (inp_out_ids) {
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    }
+
+    // Per-block shared head: use nextn.shared_head_norm / nextn.shared_head_head
+    // when they were loaded. Each one independently falls back to the main
+    // output_norm / output weight when absent (both are optional at load time,
+    // although Step-3.7 always ships a per-block head).
+    ggml_tensor * head_norm_w = layer.nextn.shared_head_norm ? layer.nextn.shared_head_norm : model.output_norm;
+    GGML_ASSERT(head_norm_w && "STEP35 MTP: missing shared_head_norm / output_norm");
+    cur = build_norm(cur, head_norm_w, nullptr, LLM_NORM_RMS, -1);
+    cb(cur, "mtp_shared_head_norm", -1);
+
+    ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output;
+    GGML_ASSERT(head_w && "STEP35 MTP: missing shared_head_head / output");
+    cur = build_lora_mm(head_w, cur);
+    cb(cur, "result_output", -1);
+
+    res->t_logits = cur;
+    ggml_build_forward_expand(gf, cur);
+}