Pushkinist · Pushkinist · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/crates/rmlx-cli/src/commands/info.rs b/crates/rmlx-cli/src/commands/info.rs
@@ -396,19 +396,29 @@ pub(crate) fn run_info(
         };
         print_load_phases_json();
 
-        // Load tokenizer. Inline BOS extraction -- avoids adding rmlx-server as a dep
-        // of the probe path (dep-DAG constraint). ~10 lines mirrors tokenizer_io logic.
+        // Resolve the BOS id, then load the tokenizer.
         let bos_id = load_bos_id(model_path)?;
         info!(bos_id, "smoke_probe: resolved BOS token id");
 
         let tokenizer_path = model_path.join("tokenizer.json");
         let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
             .map_err(|e| anyhow::anyhow!("smoke_probe: load tokenizer: {e}"))?;
 
-        // B5b: seed with the fixed deterministic prompt (shared single source
-        // of truth in arch::smoke_prompt_ids — do NOT fork the seed logic).
-        let prompt_ids = arch::smoke_prompt_ids(&tokenizer, bos_id)
-            .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?;
+        // Build the smoke prompt through the model's real chat template when one
+        // exists (production-shaped, turn-structured input), falling back to the
+        // shared bare seed otherwise. This keeps the probe verdict consistent
+        // with how the model is actually served — a bare instruction can make a
+        // healthy instruction-tuned snapshot degenerate into a filler loop (the
+        // reference loader does the same), which previously raised false
+        // Broken* verdicts. `chat_template::smoke_prompt_ids` returns None when no
+        // templated prompt can be built (e.g. base snapshots with no template);
+        // `arch::smoke_prompt_ids` then seeds from the `bos_id` resolved above.
+        let prompt_ids = match rmlx_server::chat_template::smoke_prompt_ids(model_path, &tokenizer)
+        {
+            Some(ids) => ids,
+            None => arch::smoke_prompt_ids(&tokenizer, bos_id)
+                .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?,
+        };
 
         info!(
             ?kv_quant_override,

diff --git a/crates/rmlx-models/src/arch/loader.rs b/crates/rmlx-models/src/arch/loader.rs
@@ -371,13 +371,22 @@ pub fn smoke_prompt_ids(tokenizer: &tokenizers::Tokenizer, bos_id: u32) -> Resul
 /// `tokenizer_config.json`, runs greedy generation for 8 steps, and returns
 /// a `SmokeVerdict`. Used by the server's `--require-smoke-probe` gate (B5).
 ///
+/// `prompt_ids_override` lets a caller that owns the chat-template engine
+/// (e.g. `rmlx-server`) feed a production-shaped, turn-structured prompt so the
+/// probe matches how the model is actually served. When `None`, the probe falls
+/// back to the shared bare-instruction seed (`smoke_prompt_ids`). Instruction
+/// models can degenerate into repeated filler on a bare prompt even when
+/// healthy — the reference loader reproduces this identically — so the templated
+/// path is preferred when available to avoid false `Broken*` verdicts.
+///
 /// Returns `Err` only for hard load/tokenizer failures. `Ok(verdict)` where
 /// `verdict != SmokeVerdict::Ok` means the snapshot is broken but loadable.
 pub fn run_smoke_probe(
     model_dir: &Path,
     device: Device,
     kv_quant: Option<rmlx_kv_quant::KvQuant>,
     max_ctx_override: Option<i32>,
+    prompt_ids_override: Option<Vec<u32>>,
 ) -> Result<crate::decode_loop::SmokeVerdict> {
     let model = load_model(model_dir, device, &LoadOpts::default())?;
 
@@ -389,11 +398,15 @@ pub fn run_smoke_probe(
     let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
         .map_err(|e| Error::Model(format!("run_smoke_probe: load tokenizer: {e}")))?;
 
-    // B5b: seed with a fixed deterministic prompt instead of bare BOS so
-    // healthy instruction-tuned snapshots generate real text.
-    let prompt_ids = smoke_prompt_ids(&tokenizer, bos_id)?;
+    // Prefer a caller-provided, chat-templated prompt; otherwise seed with the
+    // shared bare instruction so healthy snapshots generate real text.
+    let (prompt_ids, templated) = match prompt_ids_override {
+        Some(ids) if !ids.is_empty() => (ids, true),
+        _ => (smoke_prompt_ids(&tokenizer, bos_id)?, false),
+    };
     tracing::info!(
         prompt_len = prompt_ids.len(),
+        templated,
         seed = SMOKE_PROMPT,
         "run_smoke_probe: seeded smoke prompt"
     );

diff --git a/crates/rmlx-server/src/chat_template.rs b/crates/rmlx-server/src/chat_template.rs
@@ -157,7 +157,7 @@ pub struct RenderOpts<'a> {
 /// Construct once per model; `render` may be called many times.
 #[allow(
     clippy::exhaustive_structs,
-    reason = "internal closed template struct — private minijinja environment field; public API is from_template_string() and render(); adding a field requires updating ChatTemplate::from_template_string"
+    reason = "internal closed template struct — private minijinja environment field; public API is ChatTemplate::new() and render(); adding a field requires updating ChatTemplate::new"
 )]
 pub struct ChatTemplate {
     env: Environment<'static>,
@@ -218,6 +218,91 @@ pub fn load_template_source(model_dir: &Path) -> Result<String> {
         .map_err(|e| Error::Other(format!("cannot read {}: {e}", path.display())))
 }
 
+// ── Smoke-probe prompt ─────────────────────────────────────────────────────────
+
+/// Build the smoke-probe input token ids for a model snapshot by rendering the
+/// canonical seed through the model's real `chat_template.jinja`, so the probe
+/// exercises the same prompt shape the model is served with in production.
+///
+/// Instruction-tuned models are trained to continue *turn-structured* input
+/// (`<start_of_turn>user … <start_of_turn>model`). Fed a bare instruction with
+/// no turn scaffolding, a healthy model can still degenerate into a repeated
+/// filler token — a behaviour the reference loader reproduces identically. That
+/// made the bare-seed probe raise false `Broken*` verdicts for snapshots that
+/// generate correctly through `serve`. Rendering the canonical seed through
+/// `chat_template.jinja` removes that false positive generally, for any arch
+/// whose bare-prompt continuation is degenerate.
+///
+/// Returns `None` whenever no templated prompt can be built — the template or
+/// `tokenizer_config.json` fails to load, or compile/render/encode fails or
+/// yields zero tokens (the usual case: base / non-chat snapshots). The caller
+/// then falls back to the shared bare seed using its own canonical BOS
+/// resolution, so the BOS fallback chain lives in exactly one place per entry
+/// point and no token id is hard-coded here.
+///
+/// The template is expected to emit its own `<bos>`, so the rendered text is
+/// tokenized with `add_special_tokens = false` (mirrors the production request path).
+pub fn smoke_prompt_ids(model_dir: &Path, tokenizer: &tokenizers::Tokenizer) -> Option<Vec<u32>> {
+    match render_templated_seed(model_dir, tokenizer) {
+        Ok(ids) => Some(ids),
+        Err(reason) => {
+            // Expected for base / non-chat snapshots. Logged at debug level so a
+            // run's `.jsonl` shows whether the probe ran templated or fell back
+            // to the bare seed, and why — without warning on the normal case.
+            tracing::debug!(
+                reason,
+                "smoke_prompt_ids: no usable chat template — using bare seed"
+            );
+            None
+        }
+    }
+}
+
+/// Render `arch::SMOKE_PROMPT` as a single user turn through the model's chat
+/// template and tokenize the result. Returns the encoded ids on success, or a
+/// human-readable reason string on any miss (template or `tokenizer_config.json`
+/// load failure, compile/render/encode failure, or empty output) so the caller
+/// can log it once at the fallback boundary.
+fn render_templated_seed(
+    model_dir: &Path,
+    tokenizer: &tokenizers::Tokenizer,
+) -> std::result::Result<Vec<u32>, String> {
+    let src = load_template_source(model_dir).map_err(|e| format!("load template: {e}"))?;
+    let tpl = ChatTemplate::new(src).map_err(|e| format!("compile template: {e}"))?;
+
+    let cfg = crate::tokenizer_io::load_tokenizer_config(model_dir)
+        .map_err(|e| format!("load tokenizer_config: {e}"))?;
+    let bos_token = cfg.bos_token.as_deref().unwrap_or("");
+    let eos_token = cfg.eos_token.as_deref().unwrap_or("");
+
+    let messages = [ChatMessageTpl {
+        role: "user",
+        content: rmlx_models::arch::SMOKE_PROMPT,
+        ..ChatMessageTpl::default()
+    }];
+    let opts = RenderOpts {
+        bos_token,
+        eos_token,
+        add_generation_prompt: true,
+        tools: &[],
+        enable_thinking: None,
+    };
+    let rendered = tpl
+        .render(&messages, &opts)
+        .map_err(|e| format!("render template: {e}"))?;
+
+    // The template is expected to emit the BOS marker itself, so encode without
+    // re-adding special tokens (mirrors the production request path).
+    let enc = tokenizer
+        .encode(rendered.text.as_str(), false)
+        .map_err(|e| format!("encode rendered prompt: {e}"))?;
+    let ids = enc.get_ids().to_vec();
+    if ids.is_empty() {
+        return Err("rendered prompt encoded to zero tokens".to_owned());
+    }
+    Ok(ids)
+}
+
 // ── ChatTemplate ──────────────────────────────────────────────────────────────
 
 impl ChatTemplate {

diff --git a/crates/rmlx-server/src/chat_template_tests.rs b/crates/rmlx-server/src/chat_template_tests.rs
@@ -633,3 +633,112 @@ fn qwen3_plain_render_byte_identical_after_tool_support() {
         "plain render diverged after tool-support change (A5.2 invariant)"
     );
 }
+
+// ── Smoke-probe prompt: template-shaped vs bare-seed fallback ───────────────
+//
+// Regression guard for the gemma4-unified 4-bit false-positive: the smoke probe
+// must feed turn-structured input when the snapshot ships a chat template, so a
+// healthy instruction-tuned model is exercised the same way it is served. A bare
+// instruction prompt can make even a healthy model loop a filler token; rendering
+// through the template removes that false Broken* verdict. These two tests assert
+// the template path is taken when present and the bare-seed fallback otherwise —
+// no model snapshot required.
+
+/// Write a minimal snapshot fixture into `dir`: a WordLevel `tokenizer.json`
+/// (turn markers, `user`/`model`, every `SMOKE_PROMPT` token, distinct ids), a
+/// `tokenizer_config.json`, and — when `with_template` — a `chat_template.jinja`.
+fn write_smoke_fixture(dir: &Path, with_template: bool) {
+    // Whitespace pre-tokenizer splits on spaces and punctuation, so `France?`
+    // becomes `France` + `?`; the vocab lists both pieces. The `?` id is 26.
+    let vocab = r#"{
+        "<bos>":0,"<eos>":1,"<unk>":2,
+        "<start_of_turn>":10,"<end_of_turn>":11,"user":12,"model":13,
+        "What":20,"is":21,"the":22,"capital":23,"of":24,"France":25,"?":26
+    }"#;
+    // The angle-bracket markers must tokenize atomically (the Whitespace
+    // pre-tokenizer would otherwise split `<bos>` into `< bos >`), so they are
+    // registered as added/special tokens — mirroring real HF tokenizers.
+    let added = r#"[
+        {"id":0,"content":"<bos>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},
+        {"id":10,"content":"<start_of_turn>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},
+        {"id":11,"content":"<end_of_turn>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true}
+    ]"#;
+    let tok_json = format!(
+        r#"{{"version":"1.0","truncation":null,"padding":null,"added_tokens":{added},"normalizer":null,"pre_tokenizer":{{"type":"Whitespace"}},"post_processor":null,"decoder":null,"model":{{"type":"WordLevel","vocab":{vocab},"unk_token":"<unk>"}}}}"#
+    );
+    std::fs::write(dir.join("tokenizer.json"), tok_json).expect("write tokenizer.json");
+    std::fs::write(
+        dir.join("tokenizer_config.json"),
+        r#"{"bos_token":"<bos>","eos_token":"<eos>"}"#,
+    )
+    .expect("write tokenizer_config.json");
+    if with_template {
+        // Gemma-style turn markers. The template emits its own BOS. Pieces are
+        // space-separated so the Whitespace pre-tokenizer yields exact vocab ids.
+        let tpl = "{{ bos_token }} {% for m in messages %}<start_of_turn> {{ m.role }} {{ m.content }} <end_of_turn> {% endfor %}{% if add_generation_prompt %}<start_of_turn> model{% endif %}";
+        std::fs::write(dir.join("chat_template.jinja"), tpl).expect("write chat_template.jinja");
+    }
+}
+
+#[test]
+fn smoke_prompt_uses_chat_template_when_present() {
+    let tmp = tempfile::tempdir().expect("tempdir");
+    write_smoke_fixture(tmp.path(), true);
+    let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok");
+
+    let ids = smoke_prompt_ids(tmp.path(), &tk).expect("templated smoke prompt");
+
+    // Must be turn-structured: starts with BOS(0), has a <start_of_turn>(10)
+    // user(12) span, and ends on the model-turn opener (10, 13).
+    assert_eq!(
+        ids.first(),
+        Some(&0),
+        "templated prompt must begin with BOS"
+    );
+    assert!(
+        ids.windows(2).any(|w| w == [10, 12]),
+        "expected a <start_of_turn> user span: {ids:?}"
+    );
+    assert_eq!(
+        &ids[ids.len() - 2..],
+        &[10, 13],
+        "templated prompt must end on the <start_of_turn> model opener: {ids:?}"
+    );
+    // Spot-check that the prompt body survived templating (What=20, capital=23, France=25).
+    for t in [20u32, 23, 25] {
+        assert!(ids.contains(&t), "missing prompt token {t} in {ids:?}");
+    }
+}
+
+/// Without a chat template, `smoke_prompt_ids` must return `None` so the caller
+/// (`run_smoke_probe` / the CLI probe) builds the bare seed itself with its own
+/// canonical BOS resolution. The function must NOT invent a token id — earlier
+/// the server probe hard-coded a magic id when no `<bos>` resolved, seeding the
+/// probe wrong; returning `None` keeps the BOS fallback chain in one place.
+#[test]
+fn smoke_prompt_falls_back_to_bare_seed_without_template() {
+    let tmp = tempfile::tempdir().expect("tempdir");
+    write_smoke_fixture(tmp.path(), false); // no chat_template.jinja
+    let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok");
+
+    // No template ⇒ None: building the templated prompt is all this fn owns.
+    assert!(
+        smoke_prompt_ids(tmp.path(), &tk).is_none(),
+        "no chat template must yield None so the caller resolves BOS itself"
+    );
+
+    // The bare-seed builder the caller falls back to is the shared
+    // `arch::smoke_prompt_ids`, which takes the canonically-resolved BOS id and
+    // never substitutes a magic literal. Seeded with a real BOS (0 here) it must
+    // yield [bos] + SMOKE_PROMPT with no `<start_of_turn>` (10) or `model` (13).
+    let ids = rmlx_models::arch::smoke_prompt_ids(&tk, 0).expect("bare-seed smoke prompt");
+    assert_eq!(ids.first(), Some(&0), "bare seed must begin with BOS");
+    assert!(
+        !ids.contains(&10) && !ids.contains(&13),
+        "bare-seed fallback must not contain turn markers: {ids:?}"
+    );
+    assert!(
+        ids.contains(&20) && ids.contains(&25),
+        "missing prompt body: {ids:?}"
+    );
+}
diff --git a/crates/rmlx-server/src/openai/state.rs b/crates/rmlx-server/src/openai/state.rs
@@ -854,11 +854,22 @@ impl AppState {
         // zero-overhead path unchanged.
         if self.require_smoke_probe {
             tracing::info!(model_id, path = %entry.abs_path.display(), "B5: running smoke probe before first load");
+
+            // Render the smoke seed through the model's real chat template so the
+            // probe exercises production-shaped, turn-structured input. When no
+            // usable template exists, `smoke_prompt_ids` returns None and
+            // `run_smoke_probe` builds the bare seed itself with its own
+            // canonical BOS resolution — no token id is invented here.
+            let templated_prompt = crate::tokenizer_io::load_tokenizer(&entry.abs_path)
+                .ok()
+                .and_then(|tk| crate::chat_template::smoke_prompt_ids(&entry.abs_path, &tk));
+
             let verdict = rmlx_models::arch::run_smoke_probe(
                 &entry.abs_path,
                 rmlx_mlx::Device::Gpu,
                 None, // use arch default KV quant
                 None, // use model default max_ctx
+                templated_prompt,
             )
             .map_err(|e| format!("smoke probe error for '{model_id}': {e}"))?;
 

diff --git a/docs/CLI.md b/docs/CLI.md
@@ -265,6 +265,15 @@ rmlx info --model /path/to/snapshot --probe-smoke
 | `--max-ctx` | u32 | (from model) | **Virtual ceiling** on context length, in tokens (not an eager allocation): the KV ring grows lazily up to it, prompts over it are rejected. See `docs/KV_CACHE.md` §4.6. Must be ≥ 256 when set. |
 | `--list-cache-types` | bool flag | off | Print the full §D1 KV codec table and exit. No model load. |
 
+The smoke probe renders its fixed seed prompt through the snapshot's
+`chat_template.jinja` when present, so an instruction-tuned model is exercised
+on the same turn-structured input it is served with. A *bare* instruction (no
+turn markers) makes some healthy instruction-tuned models loop a filler token —
+the reference loader (`mlx-lm`) reproduces this identically — which previously
+raised false `BrokenPunctLoop` verdicts (e.g. the QAT-4bit `gemma-4-12B`
+unified snapshots, which serve coherently via the chat template). Snapshots
+with no chat template fall back to the bare-instruction seed.
+
 ---
 
 ### `baseline`

diff --git a/docs/MODELS.md b/docs/MODELS.md
@@ -649,6 +649,16 @@ arch string to the Gemma4 text loader; the multimodal-embedder tensors
 (`embed_vision`/`embed_audio`/`vision_embedder.*`) are not read, so image/audio
 input is not yet wired for 12B (text serves end-to-end).
 
+Text serves correctly at **all weight quants**, including the mixed 4/8-bit QAT
+snapshots (`gemma-4-12B-it-qat-4bit` affine, `gemma-4-12B-it-qat-mxfp4`): their
+`quantization` block keeps the MLP `gate/up/down` projections at 8-bit while the
+rest is 4-bit, which the per-tensor override resolver handles unchanged. These
+snapshots emit a degenerate filler token (`'1'`) on a *bare* instruction prompt
+with no turn markers — `mlx-lm` reproduces this identically, and the mxfp8 build
+degenerates the same way to `.`/`_`. The `--probe-smoke` heuristic therefore
+templates its seed (see the `info` section of `docs/CLI.md`) so the verdict
+matches the served behaviour rather than the bare-prompt artifact.
+
 ### Key structural properties
 
 **SWA + FullAttention alternation.** Per-layer `layer_types` array determines