From 896f7f7b4766874f82de101dd567caee91d5f426 Mon Sep 17 00:00:00 2001
From: Pushkinist <4850452+Pushkinist@users.noreply.github.com>
Date: Wed, 17 Jun 2026 17:40:33 +0700
Subject: [PATCH 1/2] fix(smoke): template the smoke-probe seed; fixes false
 Broken verdict on gemma4-unified 4-bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The --probe-smoke heuristic fed the model a bare instruction prompt with no
turn markers. Instruction-tuned snapshots can degenerate into a repeated filler
token on such input — the mlx-lm reference loader reproduces this identically,
and the mxfp8 gemma-4-12B build degenerates the same way (to '.'/'_'). The QAT
4-bit unified 12B snapshots happen to land on '1', which the classifier flags
as BrokenPunctLoop, while the served chat-template path generates correctly
("The capital of France is Paris.").

This was a false positive in the probe, not a 4-bit dequant bug: rMLX's step-0
logits match the oracle to within quant noise (top id 236770 '1', |max|~28.4 vs
28.5). The per-tensor mixed 4/8-bit quant overrides are resolved correctly.

Fix generally: render the fixed smoke seed through the snapshot's real
chat_template.jinja when present (production-shaped, turn-structured input),
falling back to the bare seed for base/non-chat snapshots. New
chat_template::smoke_prompt_ids owns this; both the rmlx info probe and the
server --require-smoke-probe gate use it. run_smoke_probe gains an optional
caller-supplied prompt-ids override so rmlx-models stays free of the template
engine dep.

- crates/rmlx-server/src/chat_template.rs: smoke_prompt_ids + templated builder
- crates/rmlx-models/src/arch/loader.rs: run_smoke_probe prompt-ids override
- crates/rmlx-server/src/openai/state.rs: render templated probe prompt at gate
- crates/rmlx-cli/src/commands/info.rs: use templated smoke_prompt_ids
- chat_template_tests.rs: template-path + bare-seed fallback unit tests
- docs/CLI.md, docs/MODELS.md: document the templated probe + 4-bit text status
---
 crates/rmlx-cli/src/commands/info.rs          | 17 ++--
 crates/rmlx-models/src/arch/loader.rs         | 19 +++-
 crates/rmlx-server/src/chat_template.rs       | 70 ++++++++++++++
 crates/rmlx-server/src/chat_template_tests.rs | 96 +++++++++++++++++++
 crates/rmlx-server/src/openai/state.rs        | 13 +++
 docs/CLI.md                                   |  9 ++
 docs/MODELS.md                                | 10 ++
 7 files changed, 225 insertions(+), 9 deletions(-)

diff --git a/crates/rmlx-cli/src/commands/info.rs b/crates/rmlx-cli/src/commands/info.rs
index 1369759..1685db8 100644
--- a/crates/rmlx-cli/src/commands/info.rs
+++ b/crates/rmlx-cli/src/commands/info.rs
@@ -396,8 +396,7 @@ pub(crate) fn run_info(
         };
         print_load_phases_json();
 
-        // Load tokenizer. Inline BOS extraction -- avoids adding rmlx-server as a dep
-        // of the probe path (dep-DAG constraint). ~10 lines mirrors tokenizer_io logic.
+        // Load tokenizer.
         let bos_id = load_bos_id(model_path)?;
         info!(bos_id, "smoke_probe: resolved BOS token id");
 
@@ -405,10 +404,16 @@ pub(crate) fn run_info(
         let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
             .map_err(|e| anyhow::anyhow!("smoke_probe: load tokenizer: {e}"))?;
 
-        // B5b: seed with the fixed deterministic prompt (shared single source
-        // of truth in arch::smoke_prompt_ids — do NOT fork the seed logic).
-        let prompt_ids = arch::smoke_prompt_ids(&tokenizer, bos_id)
-            .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?;
+        // Build the smoke prompt through the model's real chat template when one
+        // exists (production-shaped, turn-structured input), falling back to the
+        // shared bare seed otherwise. This keeps the probe verdict consistent
+        // with how the model is actually served — a bare instruction can make a
+        // healthy instruction-tuned snapshot degenerate into a filler loop (the
+        // reference loader does the same), which previously raised false
+        // Broken* verdicts. Single source of truth lives in chat_template.
+        let prompt_ids =
+            rmlx_server::chat_template::smoke_prompt_ids(model_path, &tokenizer, bos_id)
+                .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?;
 
         info!(
             ?kv_quant_override,
diff --git a/crates/rmlx-models/src/arch/loader.rs b/crates/rmlx-models/src/arch/loader.rs
index 91a680b..22f0abf 100644
--- a/crates/rmlx-models/src/arch/loader.rs
+++ b/crates/rmlx-models/src/arch/loader.rs
@@ -371,6 +371,14 @@ pub fn smoke_prompt_ids(tokenizer: &tokenizers::Tokenizer, bos_id: u32) -> Resul
 /// `tokenizer_config.json`, runs greedy generation for 8 steps, and returns
 /// a `SmokeVerdict`. Used by the server's `--require-smoke-probe` gate (B5).
 ///
+/// `prompt_ids_override` lets a caller that owns the chat-template engine
+/// (e.g. `rmlx-server`) feed a production-shaped, turn-structured prompt so the
+/// probe matches how the model is actually served. When `None`, the probe falls
+/// back to the shared bare-instruction seed (`smoke_prompt_ids`). Instruction
+/// models can degenerate into repeated filler on a bare prompt even when
+/// healthy — the reference loader reproduces this identically — so the templated
+/// path is preferred when available to avoid false `Broken*` verdicts.
+///
 /// Returns `Err` only for hard load/tokenizer failures. `Ok(verdict)` where
 /// `verdict != SmokeVerdict::Ok` means the snapshot is broken but loadable.
 pub fn run_smoke_probe(
@@ -378,6 +386,7 @@ pub fn run_smoke_probe(
     device: Device,
     kv_quant: Option<rmlx_kv_quant::KvQuant>,
     max_ctx_override: Option<i32>,
+    prompt_ids_override: Option<Vec<u32>>,
 ) -> Result<crate::decode_loop::SmokeVerdict> {
     let model = load_model(model_dir, device, &LoadOpts::default())?;
 
@@ -389,11 +398,15 @@ pub fn run_smoke_probe(
     let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
         .map_err(|e| Error::Model(format!("run_smoke_probe: load tokenizer: {e}")))?;
 
-    // B5b: seed with a fixed deterministic prompt instead of bare BOS so
-    // healthy instruction-tuned snapshots generate real text.
-    let prompt_ids = smoke_prompt_ids(&tokenizer, bos_id)?;
+    // Prefer a caller-provided, chat-templated prompt; otherwise seed with the
+    // shared bare instruction so healthy snapshots generate real text.
+    let (prompt_ids, templated) = match prompt_ids_override {
+        Some(ids) if !ids.is_empty() => (ids, true),
+        _ => (smoke_prompt_ids(&tokenizer, bos_id)?, false),
+    };
     tracing::info!(
         prompt_len = prompt_ids.len(),
+        templated,
         seed = SMOKE_PROMPT,
         "run_smoke_probe: seeded smoke prompt"
     );
diff --git a/crates/rmlx-server/src/chat_template.rs b/crates/rmlx-server/src/chat_template.rs
index f8f221b..4a10d64 100644
--- a/crates/rmlx-server/src/chat_template.rs
+++ b/crates/rmlx-server/src/chat_template.rs
@@ -218,6 +218,76 @@ pub fn load_template_source(model_dir: &Path) -> Result<String> {
         .map_err(|e| Error::Other(format!("cannot read {}: {e}", path.display())))
 }
 
+// ── Smoke-probe prompt ─────────────────────────────────────────────────────────
+
+/// Build the smoke-probe input token ids for a model snapshot, preferring the
+/// model's real chat template so the probe exercises the same prompt shape the
+/// model is served with in production.
+///
+/// Instruction-tuned models are trained to continue *turn-structured* input
+/// (`<start_of_turn>user … <start_of_turn>model`). Fed a bare instruction with
+/// no turn scaffolding, a healthy model can still degenerate into a repeated
+/// filler token — a behaviour the reference loader reproduces identically. That
+/// made the bare-seed probe (`arch::smoke_prompt_ids`) raise false `Broken*`
+/// verdicts for snapshots that generate correctly through `serve`. Rendering
+/// the canonical seed through `chat_template.jinja` removes that false positive
+/// generally, for any arch whose bare-prompt continuation is degenerate.
+///
+/// Falls back to the bare-seed `arch::smoke_prompt_ids` when the snapshot has no
+/// `chat_template.jinja` or the render/encode fails — preserving probe coverage
+/// for base (non-chat) snapshots. The template emits its own `<bos>`, so the
+/// rendered text is tokenized with `add_special_tokens = false`.
+pub fn smoke_prompt_ids(
+    model_dir: &Path,
+    tokenizer: &tokenizers::Tokenizer,
+    bos_id: u32,
+) -> Result<Vec<u32>> {
+    if let Some(ids) = templated_smoke_prompt_ids(model_dir, tokenizer) {
+        return Ok(ids);
+    }
+    // No usable chat template — fall back to the shared bare seed.
+    rmlx_models::arch::smoke_prompt_ids(tokenizer, bos_id)
+        .map_err(|e| Error::Other(format!("smoke_prompt_ids: bare-seed build failed: {e}")))
+}
+
+/// Render `arch::SMOKE_PROMPT` as a single user turn through the model's chat
+/// template and tokenize the result. Returns `None` (so the caller falls back
+/// to the bare seed) when no template exists or rendering/encoding fails.
+fn templated_smoke_prompt_ids(
+    model_dir: &Path,
+    tokenizer: &tokenizers::Tokenizer,
+) -> Option<Vec<u32>> {
+    let src = load_template_source(model_dir).ok()?;
+    let tpl = ChatTemplate::new(src).ok()?;
+
+    let cfg = crate::tokenizer_io::load_tokenizer_config(model_dir).ok()?;
+    let bos_token = cfg.bos_token.as_deref().unwrap_or("");
+    let eos_token = cfg.eos_token.as_deref().unwrap_or("");
+
+    let messages = [ChatMessageTpl {
+        role: "user",
+        content: rmlx_models::arch::SMOKE_PROMPT,
+        ..ChatMessageTpl::default()
+    }];
+    let opts = RenderOpts {
+        bos_token,
+        eos_token,
+        add_generation_prompt: true,
+        tools: &[],
+        enable_thinking: None,
+    };
+    let rendered = tpl.render(&messages, &opts).ok()?;
+
+    // The template already emits the BOS marker, so encode without re-adding
+    // special tokens (mirrors the production request path).
+    let enc = tokenizer.encode(rendered.text.as_str(), false).ok()?;
+    let ids = enc.get_ids().to_vec();
+    if ids.is_empty() {
+        return None;
+    }
+    Some(ids)
+}
+
 // ── ChatTemplate ──────────────────────────────────────────────────────────────
 
 impl ChatTemplate {
diff --git a/crates/rmlx-server/src/chat_template_tests.rs b/crates/rmlx-server/src/chat_template_tests.rs
index adca95c..43d3678 100644
--- a/crates/rmlx-server/src/chat_template_tests.rs
+++ b/crates/rmlx-server/src/chat_template_tests.rs
@@ -633,3 +633,99 @@ fn qwen3_plain_render_byte_identical_after_tool_support() {
         "plain render diverged after tool-support change (A5.2 invariant)"
     );
 }
+
+// ── Smoke-probe prompt: template-shaped vs bare-seed fallback ───────────────
+//
+// Regression guard for the gemma4-unified 4-bit false-positive: the smoke probe
+// must feed turn-structured input when the snapshot ships a chat template, so a
+// healthy instruction-tuned model is exercised the same way it is served. A bare
+// instruction prompt can make even a healthy model loop a filler token; rendering
+// through the template removes that false Broken* verdict. These two tests assert
+// the template path is taken when present and the bare-seed fallback otherwise —
+// no model snapshot required.
+
+/// A minimal WordLevel `tokenizer.json` whose vocab covers the turn markers, the
+/// `user`/`model` role words, and every whitespace-split token of `SMOKE_PROMPT`.
+/// Token ids are arbitrary but distinct so the encoded id sequence is checkable.
+fn write_smoke_fixture(dir: &Path, with_template: bool) {
+    // Whitespace pre-tokenizer splits on spaces and punctuation, so `France?`
+    // becomes `France` + `?`; the vocab lists both pieces. The `?` id is 26.
+    let vocab = r#"{
+        "<bos>":0,"<eos>":1,"<unk>":2,
+        "<start_of_turn>":10,"<end_of_turn>":11,"user":12,"model":13,
+        "What":20,"is":21,"the":22,"capital":23,"of":24,"France":25,"?":26
+    }"#;
+    // The angle-bracket markers must tokenize atomically (the Whitespace
+    // pre-tokenizer would otherwise split `<bos>` into `< bos >`), so they are
+    // registered as added/special tokens — mirroring real HF tokenizers.
+    let added = r#"[
+        {"id":0,"content":"<bos>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},
+        {"id":10,"content":"<start_of_turn>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},
+        {"id":11,"content":"<end_of_turn>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true}
+    ]"#;
+    let tok_json = format!(
+        r#"{{"version":"1.0","truncation":null,"padding":null,"added_tokens":{added},"normalizer":null,"pre_tokenizer":{{"type":"Whitespace"}},"post_processor":null,"decoder":null,"model":{{"type":"WordLevel","vocab":{vocab},"unk_token":"<unk>"}}}}"#
+    );
+    std::fs::write(dir.join("tokenizer.json"), tok_json).expect("write tokenizer.json");
+    std::fs::write(
+        dir.join("tokenizer_config.json"),
+        r#"{"bos_token":"<bos>","eos_token":"<eos>"}"#,
+    )
+    .expect("write tokenizer_config.json");
+    if with_template {
+        // Gemma-style turn markers. The template emits its own BOS. Tokens are
+        // space-separated so the Whitespace pre-tokenizer yields exact vocab ids.
+        let tpl = "{{ bos_token }} {% for m in messages %}<start_of_turn> {{ m.role }} {{ m.content }} <end_of_turn> {% endfor %}{% if add_generation_prompt %}<start_of_turn> model{% endif %}";
+        std::fs::write(dir.join("chat_template.jinja"), tpl).expect("write chat_template.jinja");
+    }
+}
+
+#[test]
+fn smoke_prompt_uses_chat_template_when_present() {
+    let tmp = tempfile::tempdir().expect("tempdir");
+    write_smoke_fixture(tmp.path(), true);
+    let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok");
+
+    let ids = smoke_prompt_ids(tmp.path(), &tk, 0).expect("templated smoke prompt");
+
+    // Must be turn-structured: starts with BOS(0) + <start_of_turn>(10) user(12),
+    // contains the prompt tokens, and ends on the model-turn opener (10, 13).
+    assert_eq!(
+        ids.first(),
+        Some(&0),
+        "templated prompt must begin with BOS"
+    );
+    assert!(
+        ids.windows(2).any(|w| w == [10, 12]),
+        "expected a <start_of_turn> user span: {ids:?}"
+    );
+    assert_eq!(
+        &ids[ids.len() - 2..],
+        &[10, 13],
+        "templated prompt must end on the <start_of_turn> model opener: {ids:?}"
+    );
+    // The prompt body tokens are present (What=20, capital=23, France=25).
+    for t in [20u32, 23, 25] {
+        assert!(ids.contains(&t), "missing prompt token {t} in {ids:?}");
+    }
+}
+
+#[test]
+fn smoke_prompt_falls_back_to_bare_seed_without_template() {
+    let tmp = tempfile::tempdir().expect("tempdir");
+    write_smoke_fixture(tmp.path(), false); // no chat_template.jinja
+    let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok");
+
+    let ids = smoke_prompt_ids(tmp.path(), &tk, 0).expect("bare-seed smoke prompt");
+
+    // Bare seed = [bos] + SMOKE_PROMPT tokens, no turn markers (10/13 absent).
+    assert_eq!(ids.first(), Some(&0), "bare seed must begin with BOS");
+    assert!(
+        !ids.contains(&10) && !ids.contains(&13),
+        "bare-seed fallback must not contain turn markers: {ids:?}"
+    );
+    assert!(
+        ids.contains(&20) && ids.contains(&25),
+        "missing prompt body: {ids:?}"
+    );
+}
diff --git a/crates/rmlx-server/src/openai/state.rs b/crates/rmlx-server/src/openai/state.rs
index 0e8c479..8d89046 100644
--- a/crates/rmlx-server/src/openai/state.rs
+++ b/crates/rmlx-server/src/openai/state.rs
@@ -854,11 +854,24 @@ impl AppState {
         // zero-overhead path unchanged.
         if self.require_smoke_probe {
             tracing::info!(model_id, path = %entry.abs_path.display(), "B5: running smoke probe before first load");
+
+            // Render the smoke seed through the model's real chat template so the
+            // probe exercises production-shaped, turn-structured input. Falls back
+            // to the bare seed inside run_smoke_probe when no template / encode
+            // succeeds (handled by passing None).
+            let templated_prompt = crate::tokenizer_io::load_tokenizer(&entry.abs_path)
+                .ok()
+                .and_then(|tk| {
+                    let bos_id = tk.token_to_id("<bos>").unwrap_or(2);
+                    crate::chat_template::smoke_prompt_ids(&entry.abs_path, &tk, bos_id).ok()
+                });
+
             let verdict = rmlx_models::arch::run_smoke_probe(
                 &entry.abs_path,
                 rmlx_mlx::Device::Gpu,
                 None, // use arch default KV quant
                 None, // use model default max_ctx
+                templated_prompt,
             )
             .map_err(|e| format!("smoke probe error for '{model_id}': {e}"))?;
 
diff --git a/docs/CLI.md b/docs/CLI.md
index fb35711..018a2af 100644
--- a/docs/CLI.md
+++ b/docs/CLI.md
@@ -265,6 +265,15 @@ rmlx info --model /path/to/snapshot --probe-smoke
 | `--max-ctx` | u32 | (from model) | **Virtual ceiling** on context length, in tokens (not an eager allocation): the KV ring grows lazily up to it, prompts over it are rejected. See `docs/KV_CACHE.md` §4.6. Must be ≥ 256 when set. |
 | `--list-cache-types` | bool flag | off | Print the full §D1 KV codec table and exit. No model load. |
 
+The smoke probe renders its fixed seed prompt through the snapshot's
+`chat_template.jinja` when present, so an instruction-tuned model is exercised
+on the same turn-structured input it is served with. A *bare* instruction (no
+turn markers) makes some healthy instruction-tuned models loop a filler token —
+the reference loader (`mlx-lm`) reproduces this identically — which previously
+raised false `BrokenPunctLoop` verdicts (e.g. the QAT-4bit `gemma-4-12B`
+unified snapshots, which serve coherently via the chat template). Snapshots
+with no chat template fall back to the bare-instruction seed.
+
 ---
 
 ### `baseline`
diff --git a/docs/MODELS.md b/docs/MODELS.md
index b45b539..dfb56fa 100644
--- a/docs/MODELS.md
+++ b/docs/MODELS.md
@@ -649,6 +649,16 @@ arch string to the Gemma4 text loader; the multimodal-embedder tensors
 (`embed_vision`/`embed_audio`/`vision_embedder.*`) are not read, so image/audio
 input is not yet wired for 12B (text serves end-to-end).
 
+Text serves correctly at **all weight quants**, including the mixed 4/8-bit QAT
+snapshots (`gemma-4-12B-it-qat-4bit` affine, `gemma-4-12B-it-qat-mxfp4`): their
+`quantization` block keeps the MLP `gate/up/down` projections at 8-bit while the
+rest is 4-bit, which the per-tensor override resolver handles unchanged. These
+snapshots emit a degenerate filler token (`'1'`) on a *bare* instruction prompt
+with no turn markers — `mlx-lm` reproduces this identically, and the mxfp8 build
+degenerates the same way to `.`/`_`. The `--probe-smoke` heuristic therefore
+templates its seed (see `docs/CLI.md` `info`) so the verdict matches the served
+behaviour rather than the bare-prompt artifact.
+
 ### Key structural properties
 
 **SWA + FullAttention alternation.** Per-layer `layer_types` array determines

From f02a978ff4ee9f8c2ca00027a91d5cd6cca4cbd6 Mon Sep 17 00:00:00 2001
From: Pushkinist <4850452+Pushkinist@users.noreply.github.com>
Date: Wed, 17 Jun 2026 18:02:08 +0700
Subject: [PATCH 2/2] fix(smoke): drop hardcoded BOS in server probe; trace
 templated-seed fallback

The server --require-smoke-probe bare-seed fallback computed BOS as
token_to_id("<bos>").unwrap_or(2), a hardcoded string + magic id that
seeded the probe wrong for chat-template-less vocabs lacking <bos> (and,
passed as Some(ids), suppressed run_smoke_probe's own resolve_bos_id).

smoke_prompt_ids now returns Option<Vec<u32>> from the chat-template render
path only; when no usable template exists it returns None so each entry
point uses its own canonical BOS resolver: run_smoke_probe::resolve_bos_id
(server) and load_bos_id + arch::smoke_prompt_ids (CLI info). No token id is
invented in chat_template.rs.

Per the traceability rule, the templated->bare-seed fallback now emits a
debug! event carrying the failure reason (load/compile/render/encode/empty),
so a run's .jsonl shows whether the probe ran templated or fell back and why.

Also fix a stale doc reference (ChatTemplate::from_template_string ->
ChatTemplate::new) and update the None-path unit test to assert no magic id
2 leaks.
---
 crates/rmlx-cli/src/commands/info.rs          | 13 ++-
 crates/rmlx-server/src/chat_template.rs       | 81 +++++++++++--------
 crates/rmlx-server/src/chat_template_tests.rs | 19 ++++-
 crates/rmlx-server/src/openai/state.rs        | 12 ++-
 4 files changed, 78 insertions(+), 47 deletions(-)

diff --git a/crates/rmlx-cli/src/commands/info.rs b/crates/rmlx-cli/src/commands/info.rs
index 1685db8..514bdc4 100644
--- a/crates/rmlx-cli/src/commands/info.rs
+++ b/crates/rmlx-cli/src/commands/info.rs
@@ -410,10 +410,15 @@ pub(crate) fn run_info(
         // with how the model is actually served — a bare instruction can make a
         // healthy instruction-tuned snapshot degenerate into a filler loop (the
         // reference loader does the same), which previously raised false
-        // Broken* verdicts. Single source of truth lives in chat_template.
-        let prompt_ids =
-            rmlx_server::chat_template::smoke_prompt_ids(model_path, &tokenizer, bos_id)
-                .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?;
+        // Broken* verdicts. `smoke_prompt_ids` returns None for base snapshots
+        // with no usable template; the bare seed then uses the canonical
+        // `load_bos_id` resolution already performed above.
+        let prompt_ids = match rmlx_server::chat_template::smoke_prompt_ids(model_path, &tokenizer)
+        {
+            Some(ids) => ids,
+            None => arch::smoke_prompt_ids(&tokenizer, bos_id)
+                .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?,
+        };
 
         info!(
             ?kv_quant_override,
diff --git a/crates/rmlx-server/src/chat_template.rs b/crates/rmlx-server/src/chat_template.rs
index 4a10d64..d63fee9 100644
--- a/crates/rmlx-server/src/chat_template.rs
+++ b/crates/rmlx-server/src/chat_template.rs
@@ -157,7 +157,7 @@ pub struct RenderOpts<'a> {
 /// Construct once per model; `render` may be called many times.
 #[allow(
     clippy::exhaustive_structs,
-    reason = "internal closed template struct — private minijinja environment field; public API is from_template_string() and render(); adding a field requires updating ChatTemplate::from_template_string"
+    reason = "internal closed template struct — private minijinja environment field; public API is ChatTemplate::new() and render(); adding a field requires updating ChatTemplate::new"
 )]
 pub struct ChatTemplate {
     env: Environment<'static>,
@@ -220,47 +220,58 @@ pub fn load_template_source(model_dir: &Path) -> Result<String> {
 
 // ── Smoke-probe prompt ─────────────────────────────────────────────────────────
 
-/// Build the smoke-probe input token ids for a model snapshot, preferring the
-/// model's real chat template so the probe exercises the same prompt shape the
-/// model is served with in production.
+/// Build the smoke-probe input token ids for a model snapshot by rendering the
+/// canonical seed through the model's real `chat_template.jinja`, so the probe
+/// exercises the same prompt shape the model is served with in production.
 ///
 /// Instruction-tuned models are trained to continue *turn-structured* input
 /// (`<start_of_turn>user … <start_of_turn>model`). Fed a bare instruction with
 /// no turn scaffolding, a healthy model can still degenerate into a repeated
 /// filler token — a behaviour the reference loader reproduces identically. That
-/// made the bare-seed probe (`arch::smoke_prompt_ids`) raise false `Broken*`
-/// verdicts for snapshots that generate correctly through `serve`. Rendering
-/// the canonical seed through `chat_template.jinja` removes that false positive
-/// generally, for any arch whose bare-prompt continuation is degenerate.
+/// made the bare-seed probe raise false `Broken*` verdicts for snapshots that
+/// generate correctly through `serve`. Rendering the canonical seed through
+/// `chat_template.jinja` removes that false positive generally, for any arch
+/// whose bare-prompt continuation is degenerate.
 ///
-/// Falls back to the bare-seed `arch::smoke_prompt_ids` when the snapshot has no
-/// `chat_template.jinja` or the render/encode fails — preserving probe coverage
-/// for base (non-chat) snapshots. The template emits its own `<bos>`, so the
-/// rendered text is tokenized with `add_special_tokens = false`.
-pub fn smoke_prompt_ids(
-    model_dir: &Path,
-    tokenizer: &tokenizers::Tokenizer,
-    bos_id: u32,
-) -> Result<Vec<u32>> {
-    if let Some(ids) = templated_smoke_prompt_ids(model_dir, tokenizer) {
-        return Ok(ids);
+/// Returns `None` when the snapshot has no usable `chat_template.jinja` (or the
+/// render/encode fails) — base / non-chat snapshots. The caller then passes
+/// `None` to `run_smoke_probe`, which builds the shared bare seed itself with
+/// its own canonical BOS resolution. Keeping the bare-seed construction out of
+/// this function means the BOS fallback chain lives in exactly one place per
+/// entry point and no token id is hard-coded here.
+///
+/// The template emits its own `<bos>`, so the rendered text is tokenized with
+/// `add_special_tokens = false` (mirrors the production request path).
+pub fn smoke_prompt_ids(model_dir: &Path, tokenizer: &tokenizers::Tokenizer) -> Option<Vec<u32>> {
+    match render_templated_seed(model_dir, tokenizer) {
+        Ok(ids) => Some(ids),
+        Err(reason) => {
+            // Expected for base / non-chat snapshots. Recorded at debug level so
+            // a run's `.jsonl` shows whether the probe ran templated or fell back
+            // to the bare seed, and why — without warning on the normal case.
+            tracing::debug!(
+                reason,
+                "smoke_prompt_ids: no usable chat template — using bare seed"
+            );
+            None
+        }
     }
-    // No usable chat template — fall back to the shared bare seed.
-    rmlx_models::arch::smoke_prompt_ids(tokenizer, bos_id)
-        .map_err(|e| Error::Other(format!("smoke_prompt_ids: bare-seed build failed: {e}")))
 }
 
 /// Render `arch::SMOKE_PROMPT` as a single user turn through the model's chat
-/// template and tokenize the result. Returns `None` (so the caller falls back
-/// to the bare seed) when no template exists or rendering/encoding fails.
-fn templated_smoke_prompt_ids(
+/// template and tokenize the result. Returns the encoded ids on success, or a
+/// human-readable reason string on any miss (no template, compile/render/encode
+/// failure, or empty output) so the caller can log it once at the fallback
+/// boundary.
+fn render_templated_seed(
     model_dir: &Path,
     tokenizer: &tokenizers::Tokenizer,
-) -> Option<Vec<u32>> {
-    let src = load_template_source(model_dir).ok()?;
-    let tpl = ChatTemplate::new(src).ok()?;
+) -> std::result::Result<Vec<u32>, String> {
+    let src = load_template_source(model_dir).map_err(|e| format!("load template: {e}"))?;
+    let tpl = ChatTemplate::new(src).map_err(|e| format!("compile template: {e}"))?;
 
-    let cfg = crate::tokenizer_io::load_tokenizer_config(model_dir).ok()?;
+    let cfg = crate::tokenizer_io::load_tokenizer_config(model_dir)
+        .map_err(|e| format!("load tokenizer_config: {e}"))?;
     let bos_token = cfg.bos_token.as_deref().unwrap_or("");
     let eos_token = cfg.eos_token.as_deref().unwrap_or("");
 
@@ -276,16 +287,20 @@ fn templated_smoke_prompt_ids(
         tools: &[],
         enable_thinking: None,
     };
-    let rendered = tpl.render(&messages, &opts).ok()?;
+    let rendered = tpl
+        .render(&messages, &opts)
+        .map_err(|e| format!("render template: {e}"))?;
 
     // The template already emits the BOS marker, so encode without re-adding
     // special tokens (mirrors the production request path).
-    let enc = tokenizer.encode(rendered.text.as_str(), false).ok()?;
+    let enc = tokenizer
+        .encode(rendered.text.as_str(), false)
+        .map_err(|e| format!("encode rendered prompt: {e}"))?;
     let ids = enc.get_ids().to_vec();
     if ids.is_empty() {
-        return None;
+        return Err("rendered prompt encoded to zero tokens".to_owned());
     }
-    Some(ids)
+    Ok(ids)
 }
 
 // ── ChatTemplate ──────────────────────────────────────────────────────────────
diff --git a/crates/rmlx-server/src/chat_template_tests.rs b/crates/rmlx-server/src/chat_template_tests.rs
index 43d3678..3644b5b 100644
--- a/crates/rmlx-server/src/chat_template_tests.rs
+++ b/crates/rmlx-server/src/chat_template_tests.rs
@@ -686,7 +686,7 @@ fn smoke_prompt_uses_chat_template_when_present() {
     write_smoke_fixture(tmp.path(), true);
     let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok");
 
-    let ids = smoke_prompt_ids(tmp.path(), &tk, 0).expect("templated smoke prompt");
+    let ids = smoke_prompt_ids(tmp.path(), &tk).expect("templated smoke prompt");
 
     // Must be turn-structured: starts with BOS(0) + <start_of_turn>(10) user(12),
     // contains the prompt tokens, and ends on the model-turn opener (10, 13).
@@ -710,15 +710,28 @@ fn smoke_prompt_uses_chat_template_when_present() {
     }
 }
 
+/// Without a chat template, `smoke_prompt_ids` must return `None` so the caller
+/// (`run_smoke_probe` / the CLI probe) builds the bare seed itself with its own
+/// canonical BOS resolution. The function must NOT invent a token id — earlier
+/// the server probe hard-coded a magic id when no `<bos>` resolved, seeding the
+/// probe wrong; returning `None` keeps the BOS fallback chain in one place.
 #[test]
 fn smoke_prompt_falls_back_to_bare_seed_without_template() {
     let tmp = tempfile::tempdir().expect("tempdir");
     write_smoke_fixture(tmp.path(), false); // no chat_template.jinja
     let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok");
 
-    let ids = smoke_prompt_ids(tmp.path(), &tk, 0).expect("bare-seed smoke prompt");
+    // No template ⇒ None (the templated path is the only thing this fn owns).
+    assert!(
+        smoke_prompt_ids(tmp.path(), &tk).is_none(),
+        "no chat template must yield None so the caller resolves BOS itself"
+    );
 
-    // Bare seed = [bos] + SMOKE_PROMPT tokens, no turn markers (10/13 absent).
+    // The bare-seed builder the caller falls back to is the shared
+    // `arch::smoke_prompt_ids`, which takes the canonically-resolved BOS id and
+    // never substitutes a magic literal. Seeded with a real BOS (0 here) it
+    // produces [bos] + SMOKE_PROMPT, no turn markers.
+    let ids = rmlx_models::arch::smoke_prompt_ids(&tk, 0).expect("bare-seed smoke prompt");
     assert_eq!(ids.first(), Some(&0), "bare seed must begin with BOS");
     assert!(
         !ids.contains(&10) && !ids.contains(&13),
diff --git a/crates/rmlx-server/src/openai/state.rs b/crates/rmlx-server/src/openai/state.rs
index 8d89046..869a5fa 100644
--- a/crates/rmlx-server/src/openai/state.rs
+++ b/crates/rmlx-server/src/openai/state.rs
@@ -856,15 +856,13 @@ impl AppState {
             tracing::info!(model_id, path = %entry.abs_path.display(), "B5: running smoke probe before first load");
 
             // Render the smoke seed through the model's real chat template so the
-            // probe exercises production-shaped, turn-structured input. Falls back
-            // to the bare seed inside run_smoke_probe when no template / encode
-            // succeeds (handled by passing None).
+            // probe exercises production-shaped, turn-structured input. When no
+            // usable template exists, `smoke_prompt_ids` returns None and
+            // `run_smoke_probe` builds the bare seed itself with its own
+            // canonical BOS resolution — no token id is invented here.
             let templated_prompt = crate::tokenizer_io::load_tokenizer(&entry.abs_path)
                 .ok()
-                .and_then(|tk| {
-                    let bos_id = tk.token_to_id("<bos>").unwrap_or(2);
-                    crate::chat_template::smoke_prompt_ids(&entry.abs_path, &tk, bos_id).ok()
-                });
+                .and_then(|tk| crate::chat_template::smoke_prompt_ids(&entry.abs_path, &tk));
 
             let verdict = rmlx_models::arch::run_smoke_probe(
                 &entry.abs_path,