From 896f7f7b4766874f82de101dd567caee91d5f426 Mon Sep 17 00:00:00 2001 From: Pushkinist <4850452+Pushkinist@users.noreply.github.com> Date: Wed, 17 Jun 2026 17:40:33 +0700 Subject: [PATCH 1/2] fix(smoke): template the smoke-probe seed; fixes false Broken verdict on gemma4-unified 4-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The --probe-smoke heuristic fed the model a bare instruction prompt with no turn markers. Instruction-tuned snapshots can degenerate into a repeated filler token on such input — the mlx-lm reference loader reproduces this identically, and the mxfp8 gemma-4-12B build degenerates the same way (to '.'/'_'). The QAT 4-bit unified 12B snapshots happen to land on '1', which the classifier flags as BrokenPunctLoop, while the served chat-template path generates correctly ("The capital of France is Paris."). This was a false positive in the probe, not a 4-bit dequant bug: rMLX's step-0 logits match the oracle to within quant noise (top id 236770 '1', |max|~28.4 vs 28.5). The per-tensor mixed 4/8-bit quant overrides are resolved correctly. Fix generally: render the fixed smoke seed through the snapshot's real chat_template.jinja when present (production-shaped, turn-structured input), falling back to the bare seed for base/non-chat snapshots. New chat_template::smoke_prompt_ids owns this; both the rmlx info probe and the server --require-smoke-probe gate use it. run_smoke_probe gains an optional caller-supplied prompt-ids override so rmlx-models stays free of the template engine dep. - crates/rmlx-server/src/chat_template.rs: smoke_prompt_ids + templated builder - crates/rmlx-models/src/arch/loader.rs: run_smoke_probe prompt-ids override - crates/rmlx-server/src/openai/state.rs: render templated probe prompt at gate - crates/rmlx-cli/src/commands/info.rs: use templated smoke_prompt_ids - chat_template_tests.rs: template-path + bare-seed fallback unit tests - docs/CLI.md, docs/MODELS.md: document the templated probe + 4-bit text status --- crates/rmlx-cli/src/commands/info.rs | 17 ++-- crates/rmlx-models/src/arch/loader.rs | 19 +++- crates/rmlx-server/src/chat_template.rs | 70 ++++++++++++++ crates/rmlx-server/src/chat_template_tests.rs | 96 +++++++++++++++++++ crates/rmlx-server/src/openai/state.rs | 13 +++ docs/CLI.md | 9 ++ docs/MODELS.md | 10 ++ 7 files changed, 225 insertions(+), 9 deletions(-) diff --git a/crates/rmlx-cli/src/commands/info.rs b/crates/rmlx-cli/src/commands/info.rs index 1369759..1685db8 100644 --- a/crates/rmlx-cli/src/commands/info.rs +++ b/crates/rmlx-cli/src/commands/info.rs @@ -396,8 +396,7 @@ pub(crate) fn run_info( }; print_load_phases_json(); - // Load tokenizer. Inline BOS extraction -- avoids adding rmlx-server as a dep - // of the probe path (dep-DAG constraint). ~10 lines mirrors tokenizer_io logic. + // Load tokenizer. let bos_id = load_bos_id(model_path)?; info!(bos_id, "smoke_probe: resolved BOS token id"); @@ -405,10 +404,16 @@ pub(crate) fn run_info( let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path) .map_err(|e| anyhow::anyhow!("smoke_probe: load tokenizer: {e}"))?; - // B5b: seed with the fixed deterministic prompt (shared single source - // of truth in arch::smoke_prompt_ids — do NOT fork the seed logic). - let prompt_ids = arch::smoke_prompt_ids(&tokenizer, bos_id) - .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?; + // Build the smoke prompt through the model's real chat template when one + // exists (production-shaped, turn-structured input), falling back to the + // shared bare seed otherwise. This keeps the probe verdict consistent + // with how the model is actually served — a bare instruction can make a + // healthy instruction-tuned snapshot degenerate into a filler loop (the + // reference loader does the same), which previously raised false + // Broken* verdicts. Single source of truth lives in chat_template. + let prompt_ids = + rmlx_server::chat_template::smoke_prompt_ids(model_path, &tokenizer, bos_id) + .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?; info!( ?kv_quant_override, diff --git a/crates/rmlx-models/src/arch/loader.rs b/crates/rmlx-models/src/arch/loader.rs index 91a680b..22f0abf 100644 --- a/crates/rmlx-models/src/arch/loader.rs +++ b/crates/rmlx-models/src/arch/loader.rs @@ -371,6 +371,14 @@ pub fn smoke_prompt_ids(tokenizer: &tokenizers::Tokenizer, bos_id: u32) -> Resul /// `tokenizer_config.json`, runs greedy generation for 8 steps, and returns /// a `SmokeVerdict`. Used by the server's `--require-smoke-probe` gate (B5). /// +/// `prompt_ids_override` lets a caller that owns the chat-template engine +/// (e.g. `rmlx-server`) feed a production-shaped, turn-structured prompt so the +/// probe matches how the model is actually served. When `None`, the probe falls +/// back to the shared bare-instruction seed (`smoke_prompt_ids`). Instruction +/// models can degenerate into repeated filler on a bare prompt even when +/// healthy — the reference loader reproduces this identically — so the templated +/// path is preferred when available to avoid false `Broken*` verdicts. +/// /// Returns `Err` only for hard load/tokenizer failures. `Ok(verdict)` where /// `verdict != SmokeVerdict::Ok` means the snapshot is broken but loadable. pub fn run_smoke_probe( @@ -378,6 +386,7 @@ pub fn run_smoke_probe( device: Device, kv_quant: Option, max_ctx_override: Option, + prompt_ids_override: Option>, ) -> Result { let model = load_model(model_dir, device, &LoadOpts::default())?; @@ -389,11 +398,15 @@ pub fn run_smoke_probe( let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path) .map_err(|e| Error::Model(format!("run_smoke_probe: load tokenizer: {e}")))?; - // B5b: seed with a fixed deterministic prompt instead of bare BOS so - // healthy instruction-tuned snapshots generate real text. - let prompt_ids = smoke_prompt_ids(&tokenizer, bos_id)?; + // Prefer a caller-provided, chat-templated prompt; otherwise seed with the + // shared bare instruction so healthy snapshots generate real text. + let (prompt_ids, templated) = match prompt_ids_override { + Some(ids) if !ids.is_empty() => (ids, true), + _ => (smoke_prompt_ids(&tokenizer, bos_id)?, false), + }; tracing::info!( prompt_len = prompt_ids.len(), + templated, seed = SMOKE_PROMPT, "run_smoke_probe: seeded smoke prompt" ); diff --git a/crates/rmlx-server/src/chat_template.rs b/crates/rmlx-server/src/chat_template.rs index f8f221b..4a10d64 100644 --- a/crates/rmlx-server/src/chat_template.rs +++ b/crates/rmlx-server/src/chat_template.rs @@ -218,6 +218,76 @@ pub fn load_template_source(model_dir: &Path) -> Result { .map_err(|e| Error::Other(format!("cannot read {}: {e}", path.display()))) } +// ── Smoke-probe prompt ───────────────────────────────────────────────────────── + +/// Build the smoke-probe input token ids for a model snapshot, preferring the +/// model's real chat template so the probe exercises the same prompt shape the +/// model is served with in production. +/// +/// Instruction-tuned models are trained to continue *turn-structured* input +/// (`user … model`). Fed a bare instruction with +/// no turn scaffolding, a healthy model can still degenerate into a repeated +/// filler token — a behaviour the reference loader reproduces identically. That +/// made the bare-seed probe (`arch::smoke_prompt_ids`) raise false `Broken*` +/// verdicts for snapshots that generate correctly through `serve`. Rendering +/// the canonical seed through `chat_template.jinja` removes that false positive +/// generally, for any arch whose bare-prompt continuation is degenerate. +/// +/// Falls back to the bare-seed `arch::smoke_prompt_ids` when the snapshot has no +/// `chat_template.jinja` or the render/encode fails — preserving probe coverage +/// for base (non-chat) snapshots. The template emits its own ``, so the +/// rendered text is tokenized with `add_special_tokens = false`. +pub fn smoke_prompt_ids( + model_dir: &Path, + tokenizer: &tokenizers::Tokenizer, + bos_id: u32, +) -> Result> { + if let Some(ids) = templated_smoke_prompt_ids(model_dir, tokenizer) { + return Ok(ids); + } + // No usable chat template — fall back to the shared bare seed. + rmlx_models::arch::smoke_prompt_ids(tokenizer, bos_id) + .map_err(|e| Error::Other(format!("smoke_prompt_ids: bare-seed build failed: {e}"))) +} + +/// Render `arch::SMOKE_PROMPT` as a single user turn through the model's chat +/// template and tokenize the result. Returns `None` (so the caller falls back +/// to the bare seed) when no template exists or rendering/encoding fails. +fn templated_smoke_prompt_ids( + model_dir: &Path, + tokenizer: &tokenizers::Tokenizer, +) -> Option> { + let src = load_template_source(model_dir).ok()?; + let tpl = ChatTemplate::new(src).ok()?; + + let cfg = crate::tokenizer_io::load_tokenizer_config(model_dir).ok()?; + let bos_token = cfg.bos_token.as_deref().unwrap_or(""); + let eos_token = cfg.eos_token.as_deref().unwrap_or(""); + + let messages = [ChatMessageTpl { + role: "user", + content: rmlx_models::arch::SMOKE_PROMPT, + ..ChatMessageTpl::default() + }]; + let opts = RenderOpts { + bos_token, + eos_token, + add_generation_prompt: true, + tools: &[], + enable_thinking: None, + }; + let rendered = tpl.render(&messages, &opts).ok()?; + + // The template already emits the BOS marker, so encode without re-adding + // special tokens (mirrors the production request path). + let enc = tokenizer.encode(rendered.text.as_str(), false).ok()?; + let ids = enc.get_ids().to_vec(); + if ids.is_empty() { + return None; + } + Some(ids) +} + // ── ChatTemplate ────────────────────────────────────────────────────────────── impl ChatTemplate { diff --git a/crates/rmlx-server/src/chat_template_tests.rs b/crates/rmlx-server/src/chat_template_tests.rs index adca95c..43d3678 100644 --- a/crates/rmlx-server/src/chat_template_tests.rs +++ b/crates/rmlx-server/src/chat_template_tests.rs @@ -633,3 +633,99 @@ fn qwen3_plain_render_byte_identical_after_tool_support() { "plain render diverged after tool-support change (A5.2 invariant)" ); } + +// ── Smoke-probe prompt: template-shaped vs bare-seed fallback ─────────────── +// +// Regression guard for the gemma4-unified 4-bit false-positive: the smoke probe +// must feed turn-structured input when the snapshot ships a chat template, so a +// healthy instruction-tuned model is exercised the same way it is served. A bare +// instruction prompt can make even a healthy model loop a filler token; rendering +// through the template removes that false Broken* verdict. These two tests assert +// the template path is taken when present and the bare-seed fallback otherwise — +// no model snapshot required. + +/// A minimal WordLevel `tokenizer.json` whose vocab covers the turn markers, the +/// `user`/`model` role words, and every whitespace-split token of `SMOKE_PROMPT`. +/// Token ids are arbitrary but distinct so the encoded id sequence is checkable. +fn write_smoke_fixture(dir: &Path, with_template: bool) { + // Whitespace pre-tokenizer splits on spaces and punctuation, so `France?` + // becomes `France` + `?`; the vocab lists both pieces. The `?` id is 26. + let vocab = r#"{ + "":0,"":1,"":2, + "":10,"":11,"user":12,"model":13, + "What":20,"is":21,"the":22,"capital":23,"of":24,"France":25,"?":26 + }"#; + // The angle-bracket markers must tokenize atomically (the Whitespace + // pre-tokenizer would otherwise split `` into `< bos >`), so they are + // registered as added/special tokens — mirroring real HF tokenizers. + let added = r#"[ + {"id":0,"content":"","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true}, + {"id":10,"content":"","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true}, + {"id":11,"content":"","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true} + ]"#; + let tok_json = format!( + r#"{{"version":"1.0","truncation":null,"padding":null,"added_tokens":{added},"normalizer":null,"pre_tokenizer":{{"type":"Whitespace"}},"post_processor":null,"decoder":null,"model":{{"type":"WordLevel","vocab":{vocab},"unk_token":""}}}}"# + ); + std::fs::write(dir.join("tokenizer.json"), tok_json).expect("write tokenizer.json"); + std::fs::write( + dir.join("tokenizer_config.json"), + r#"{"bos_token":"","eos_token":""}"#, + ) + .expect("write tokenizer_config.json"); + if with_template { + // Gemma-style turn markers. The template emits its own BOS. Tokens are + // space-separated so the Whitespace pre-tokenizer yields exact vocab ids. + let tpl = "{{ bos_token }} {% for m in messages %} {{ m.role }} {{ m.content }} {% endfor %}{% if add_generation_prompt %} model{% endif %}"; + std::fs::write(dir.join("chat_template.jinja"), tpl).expect("write chat_template.jinja"); + } +} + +#[test] +fn smoke_prompt_uses_chat_template_when_present() { + let tmp = tempfile::tempdir().expect("tempdir"); + write_smoke_fixture(tmp.path(), true); + let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok"); + + let ids = smoke_prompt_ids(tmp.path(), &tk, 0).expect("templated smoke prompt"); + + // Must be turn-structured: starts with BOS(0) + (10) user(12), + // contains the prompt tokens, and ends on the model-turn opener (10, 13). + assert_eq!( + ids.first(), + Some(&0), + "templated prompt must begin with BOS" + ); + assert!( + ids.windows(2).any(|w| w == [10, 12]), + "expected a user span: {ids:?}" + ); + assert_eq!( + &ids[ids.len() - 2..], + &[10, 13], + "templated prompt must end on the model opener: {ids:?}" + ); + // The prompt body tokens are present (What=20, capital=23, France=25). + for t in [20u32, 23, 25] { + assert!(ids.contains(&t), "missing prompt token {t} in {ids:?}"); + } +} + +#[test] +fn smoke_prompt_falls_back_to_bare_seed_without_template() { + let tmp = tempfile::tempdir().expect("tempdir"); + write_smoke_fixture(tmp.path(), false); // no chat_template.jinja + let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok"); + + let ids = smoke_prompt_ids(tmp.path(), &tk, 0).expect("bare-seed smoke prompt"); + + // Bare seed = [bos] + SMOKE_PROMPT tokens, no turn markers (10/13 absent). + assert_eq!(ids.first(), Some(&0), "bare seed must begin with BOS"); + assert!( + !ids.contains(&10) && !ids.contains(&13), + "bare-seed fallback must not contain turn markers: {ids:?}" + ); + assert!( + ids.contains(&20) && ids.contains(&25), + "missing prompt body: {ids:?}" + ); +} diff --git a/crates/rmlx-server/src/openai/state.rs b/crates/rmlx-server/src/openai/state.rs index 0e8c479..8d89046 100644 --- a/crates/rmlx-server/src/openai/state.rs +++ b/crates/rmlx-server/src/openai/state.rs @@ -854,11 +854,24 @@ impl AppState { // zero-overhead path unchanged. if self.require_smoke_probe { tracing::info!(model_id, path = %entry.abs_path.display(), "B5: running smoke probe before first load"); + + // Render the smoke seed through the model's real chat template so the + // probe exercises production-shaped, turn-structured input. Falls back + // to the bare seed inside run_smoke_probe when no template / encode + // succeeds (handled by passing None). + let templated_prompt = crate::tokenizer_io::load_tokenizer(&entry.abs_path) + .ok() + .and_then(|tk| { + let bos_id = tk.token_to_id("").unwrap_or(2); + crate::chat_template::smoke_prompt_ids(&entry.abs_path, &tk, bos_id).ok() + }); + let verdict = rmlx_models::arch::run_smoke_probe( &entry.abs_path, rmlx_mlx::Device::Gpu, None, // use arch default KV quant None, // use model default max_ctx + templated_prompt, ) .map_err(|e| format!("smoke probe error for '{model_id}': {e}"))?; diff --git a/docs/CLI.md b/docs/CLI.md index fb35711..018a2af 100644 --- a/docs/CLI.md +++ b/docs/CLI.md @@ -265,6 +265,15 @@ rmlx info --model /path/to/snapshot --probe-smoke | `--max-ctx` | u32 | (from model) | **Virtual ceiling** on context length, in tokens (not an eager allocation): the KV ring grows lazily up to it, prompts over it are rejected. See `docs/KV_CACHE.md` §4.6. Must be ≥ 256 when set. | | `--list-cache-types` | bool flag | off | Print the full §D1 KV codec table and exit. No model load. | +The smoke probe renders its fixed seed prompt through the snapshot's +`chat_template.jinja` when present, so an instruction-tuned model is exercised +on the same turn-structured input it is served with. A *bare* instruction (no +turn markers) makes some healthy instruction-tuned models loop a filler token — +the reference loader (`mlx-lm`) reproduces this identically — which previously +raised false `BrokenPunctLoop` verdicts (e.g. the QAT-4bit `gemma-4-12B` +unified snapshots, which serve coherently via the chat template). Snapshots +with no chat template fall back to the bare-instruction seed. + --- ### `baseline` diff --git a/docs/MODELS.md b/docs/MODELS.md index b45b539..dfb56fa 100644 --- a/docs/MODELS.md +++ b/docs/MODELS.md @@ -649,6 +649,16 @@ arch string to the Gemma4 text loader; the multimodal-embedder tensors (`embed_vision`/`embed_audio`/`vision_embedder.*`) are not read, so image/audio input is not yet wired for 12B (text serves end-to-end). +Text serves correctly at **all weight quants**, including the mixed 4/8-bit QAT +snapshots (`gemma-4-12B-it-qat-4bit` affine, `gemma-4-12B-it-qat-mxfp4`): their +`quantization` block keeps the MLP `gate/up/down` projections at 8-bit while the +rest is 4-bit, which the per-tensor override resolver handles unchanged. These +snapshots emit a degenerate filler token (`'1'`) on a *bare* instruction prompt +with no turn markers — `mlx-lm` reproduces this identically, and the mxfp8 build +degenerates the same way to `.`/`_`. The `--probe-smoke` heuristic therefore +templates its seed (see `docs/CLI.md` `info`) so the verdict matches the served +behaviour rather than the bare-prompt artifact. + ### Key structural properties **SWA + FullAttention alternation.** Per-layer `layer_types` array determines From f02a978ff4ee9f8c2ca00027a91d5cd6cca4cbd6 Mon Sep 17 00:00:00 2001 From: Pushkinist <4850452+Pushkinist@users.noreply.github.com> Date: Wed, 17 Jun 2026 18:02:08 +0700 Subject: [PATCH 2/2] fix(smoke): drop hardcoded BOS in server probe; trace templated-seed fallback The server --require-smoke-probe bare-seed fallback computed BOS as token_to_id("").unwrap_or(2), a hardcoded string + magic id that seeded the probe wrong for chat-template-less vocabs lacking (and, passed as Some(ids), suppressed run_smoke_probe's own resolve_bos_id). smoke_prompt_ids now returns Option> from the chat-template render path only; when no usable template exists it returns None so each entry point uses its own canonical BOS resolver: run_smoke_probe::resolve_bos_id (server) and load_bos_id + arch::smoke_prompt_ids (CLI info). No token id is invented in chat_template.rs. Per the traceability rule, the templated->bare-seed fallback now emits a debug! event carrying the failure reason (load/compile/render/encode/empty), so a run's .jsonl shows whether the probe ran templated or fell back and why. Also fix a stale doc reference (ChatTemplate::from_template_string -> ChatTemplate::new) and update the None-path unit test to assert no magic id 2 leaks. --- crates/rmlx-cli/src/commands/info.rs | 13 ++- crates/rmlx-server/src/chat_template.rs | 81 +++++++++++-------- crates/rmlx-server/src/chat_template_tests.rs | 19 ++++- crates/rmlx-server/src/openai/state.rs | 12 ++- 4 files changed, 78 insertions(+), 47 deletions(-) diff --git a/crates/rmlx-cli/src/commands/info.rs b/crates/rmlx-cli/src/commands/info.rs index 1685db8..514bdc4 100644 --- a/crates/rmlx-cli/src/commands/info.rs +++ b/crates/rmlx-cli/src/commands/info.rs @@ -410,10 +410,15 @@ pub(crate) fn run_info( // with how the model is actually served — a bare instruction can make a // healthy instruction-tuned snapshot degenerate into a filler loop (the // reference loader does the same), which previously raised false - // Broken* verdicts. Single source of truth lives in chat_template. - let prompt_ids = - rmlx_server::chat_template::smoke_prompt_ids(model_path, &tokenizer, bos_id) - .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?; + // Broken* verdicts. `smoke_prompt_ids` returns None for base snapshots + // with no usable template; the bare seed then uses the canonical + // `load_bos_id` resolution already performed above. + let prompt_ids = match rmlx_server::chat_template::smoke_prompt_ids(model_path, &tokenizer) + { + Some(ids) => ids, + None => arch::smoke_prompt_ids(&tokenizer, bos_id) + .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?, + }; info!( ?kv_quant_override, diff --git a/crates/rmlx-server/src/chat_template.rs b/crates/rmlx-server/src/chat_template.rs index 4a10d64..d63fee9 100644 --- a/crates/rmlx-server/src/chat_template.rs +++ b/crates/rmlx-server/src/chat_template.rs @@ -157,7 +157,7 @@ pub struct RenderOpts<'a> { /// Construct once per model; `render` may be called many times. #[allow( clippy::exhaustive_structs, - reason = "internal closed template struct — private minijinja environment field; public API is from_template_string() and render(); adding a field requires updating ChatTemplate::from_template_string" + reason = "internal closed template struct — private minijinja environment field; public API is ChatTemplate::new() and render(); adding a field requires updating ChatTemplate::new" )] pub struct ChatTemplate { env: Environment<'static>, @@ -220,47 +220,58 @@ pub fn load_template_source(model_dir: &Path) -> Result { // ── Smoke-probe prompt ───────────────────────────────────────────────────────── -/// Build the smoke-probe input token ids for a model snapshot, preferring the -/// model's real chat template so the probe exercises the same prompt shape the -/// model is served with in production. +/// Build the smoke-probe input token ids for a model snapshot by rendering the +/// canonical seed through the model's real `chat_template.jinja`, so the probe +/// exercises the same prompt shape the model is served with in production. /// /// Instruction-tuned models are trained to continue *turn-structured* input /// (`user … model`). Fed a bare instruction with /// no turn scaffolding, a healthy model can still degenerate into a repeated /// filler token — a behaviour the reference loader reproduces identically. That -/// made the bare-seed probe (`arch::smoke_prompt_ids`) raise false `Broken*` -/// verdicts for snapshots that generate correctly through `serve`. Rendering -/// the canonical seed through `chat_template.jinja` removes that false positive -/// generally, for any arch whose bare-prompt continuation is degenerate. +/// made the bare-seed probe raise false `Broken*` verdicts for snapshots that +/// generate correctly through `serve`. Rendering the canonical seed through +/// `chat_template.jinja` removes that false positive generally, for any arch +/// whose bare-prompt continuation is degenerate. /// -/// Falls back to the bare-seed `arch::smoke_prompt_ids` when the snapshot has no -/// `chat_template.jinja` or the render/encode fails — preserving probe coverage -/// for base (non-chat) snapshots. The template emits its own ``, so the -/// rendered text is tokenized with `add_special_tokens = false`. -pub fn smoke_prompt_ids( - model_dir: &Path, - tokenizer: &tokenizers::Tokenizer, - bos_id: u32, -) -> Result> { - if let Some(ids) = templated_smoke_prompt_ids(model_dir, tokenizer) { - return Ok(ids); +/// Returns `None` when the snapshot has no usable `chat_template.jinja` (or the +/// render/encode fails) — base / non-chat snapshots. The caller then passes +/// `None` to `run_smoke_probe`, which builds the shared bare seed itself with +/// its own canonical BOS resolution. Keeping the bare-seed construction out of +/// this function means the BOS fallback chain lives in exactly one place per +/// entry point and no token id is hard-coded here. +/// +/// The template emits its own ``, so the rendered text is tokenized with +/// `add_special_tokens = false` (mirrors the production request path). +pub fn smoke_prompt_ids(model_dir: &Path, tokenizer: &tokenizers::Tokenizer) -> Option> { + match render_templated_seed(model_dir, tokenizer) { + Ok(ids) => Some(ids), + Err(reason) => { + // Expected for base / non-chat snapshots. Recorded at debug level so + // a run's `.jsonl` shows whether the probe ran templated or fell back + // to the bare seed, and why — without warning on the normal case. + tracing::debug!( + reason, + "smoke_prompt_ids: no usable chat template — using bare seed" + ); + None + } } - // No usable chat template — fall back to the shared bare seed. - rmlx_models::arch::smoke_prompt_ids(tokenizer, bos_id) - .map_err(|e| Error::Other(format!("smoke_prompt_ids: bare-seed build failed: {e}"))) } /// Render `arch::SMOKE_PROMPT` as a single user turn through the model's chat -/// template and tokenize the result. Returns `None` (so the caller falls back -/// to the bare seed) when no template exists or rendering/encoding fails. -fn templated_smoke_prompt_ids( +/// template and tokenize the result. Returns the encoded ids on success, or a +/// human-readable reason string on any miss (no template, compile/render/encode +/// failure, or empty output) so the caller can log it once at the fallback +/// boundary. +fn render_templated_seed( model_dir: &Path, tokenizer: &tokenizers::Tokenizer, -) -> Option> { - let src = load_template_source(model_dir).ok()?; - let tpl = ChatTemplate::new(src).ok()?; +) -> std::result::Result, String> { + let src = load_template_source(model_dir).map_err(|e| format!("load template: {e}"))?; + let tpl = ChatTemplate::new(src).map_err(|e| format!("compile template: {e}"))?; - let cfg = crate::tokenizer_io::load_tokenizer_config(model_dir).ok()?; + let cfg = crate::tokenizer_io::load_tokenizer_config(model_dir) + .map_err(|e| format!("load tokenizer_config: {e}"))?; let bos_token = cfg.bos_token.as_deref().unwrap_or(""); let eos_token = cfg.eos_token.as_deref().unwrap_or(""); @@ -276,16 +287,20 @@ fn templated_smoke_prompt_ids( tools: &[], enable_thinking: None, }; - let rendered = tpl.render(&messages, &opts).ok()?; + let rendered = tpl + .render(&messages, &opts) + .map_err(|e| format!("render template: {e}"))?; // The template already emits the BOS marker, so encode without re-adding // special tokens (mirrors the production request path). - let enc = tokenizer.encode(rendered.text.as_str(), false).ok()?; + let enc = tokenizer + .encode(rendered.text.as_str(), false) + .map_err(|e| format!("encode rendered prompt: {e}"))?; let ids = enc.get_ids().to_vec(); if ids.is_empty() { - return None; + return Err("rendered prompt encoded to zero tokens".to_owned()); } - Some(ids) + Ok(ids) } // ── ChatTemplate ────────────────────────────────────────────────────────────── diff --git a/crates/rmlx-server/src/chat_template_tests.rs b/crates/rmlx-server/src/chat_template_tests.rs index 43d3678..3644b5b 100644 --- a/crates/rmlx-server/src/chat_template_tests.rs +++ b/crates/rmlx-server/src/chat_template_tests.rs @@ -686,7 +686,7 @@ fn smoke_prompt_uses_chat_template_when_present() { write_smoke_fixture(tmp.path(), true); let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok"); - let ids = smoke_prompt_ids(tmp.path(), &tk, 0).expect("templated smoke prompt"); + let ids = smoke_prompt_ids(tmp.path(), &tk).expect("templated smoke prompt"); // Must be turn-structured: starts with BOS(0) + (10) user(12), // contains the prompt tokens, and ends on the model-turn opener (10, 13). @@ -710,15 +710,28 @@ fn smoke_prompt_uses_chat_template_when_present() { } } +/// Without a chat template, `smoke_prompt_ids` must return `None` so the caller +/// (`run_smoke_probe` / the CLI probe) builds the bare seed itself with its own +/// canonical BOS resolution. The function must NOT invent a token id — earlier +/// the server probe hard-coded a magic id when no `` resolved, seeding the +/// probe wrong; returning `None` keeps the BOS fallback chain in one place. #[test] fn smoke_prompt_falls_back_to_bare_seed_without_template() { let tmp = tempfile::tempdir().expect("tempdir"); write_smoke_fixture(tmp.path(), false); // no chat_template.jinja let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok"); - let ids = smoke_prompt_ids(tmp.path(), &tk, 0).expect("bare-seed smoke prompt"); + // No template ⇒ None (the templated path is the only thing this fn owns). + assert!( + smoke_prompt_ids(tmp.path(), &tk).is_none(), + "no chat template must yield None so the caller resolves BOS itself" + ); - // Bare seed = [bos] + SMOKE_PROMPT tokens, no turn markers (10/13 absent). + // The bare-seed builder the caller falls back to is the shared + // `arch::smoke_prompt_ids`, which takes the canonically-resolved BOS id and + // never substitutes a magic literal. Seeded with a real BOS (0 here) it + // produces [bos] + SMOKE_PROMPT, no turn markers. + let ids = rmlx_models::arch::smoke_prompt_ids(&tk, 0).expect("bare-seed smoke prompt"); assert_eq!(ids.first(), Some(&0), "bare seed must begin with BOS"); assert!( !ids.contains(&10) && !ids.contains(&13), diff --git a/crates/rmlx-server/src/openai/state.rs b/crates/rmlx-server/src/openai/state.rs index 8d89046..869a5fa 100644 --- a/crates/rmlx-server/src/openai/state.rs +++ b/crates/rmlx-server/src/openai/state.rs @@ -856,15 +856,13 @@ impl AppState { tracing::info!(model_id, path = %entry.abs_path.display(), "B5: running smoke probe before first load"); // Render the smoke seed through the model's real chat template so the - // probe exercises production-shaped, turn-structured input. Falls back - // to the bare seed inside run_smoke_probe when no template / encode - // succeeds (handled by passing None). + // probe exercises production-shaped, turn-structured input. When no + // usable template exists, `smoke_prompt_ids` returns None and + // `run_smoke_probe` builds the bare seed itself with its own + // canonical BOS resolution — no token id is invented here. let templated_prompt = crate::tokenizer_io::load_tokenizer(&entry.abs_path) .ok() - .and_then(|tk| { - let bos_id = tk.token_to_id("").unwrap_or(2); - crate::chat_template::smoke_prompt_ids(&entry.abs_path, &tk, bos_id).ok() - }); + .and_then(|tk| crate::chat_template::smoke_prompt_ids(&entry.abs_path, &tk)); let verdict = rmlx_models::arch::run_smoke_probe( &entry.abs_path,