diff --git a/crates/rmlx-cli/src/commands/info.rs b/crates/rmlx-cli/src/commands/info.rs index 1369759..514bdc4 100644 --- a/crates/rmlx-cli/src/commands/info.rs +++ b/crates/rmlx-cli/src/commands/info.rs @@ -396,8 +396,7 @@ pub(crate) fn run_info( }; print_load_phases_json(); - // Load tokenizer. Inline BOS extraction -- avoids adding rmlx-server as a dep - // of the probe path (dep-DAG constraint). ~10 lines mirrors tokenizer_io logic. + // Load tokenizer. let bos_id = load_bos_id(model_path)?; info!(bos_id, "smoke_probe: resolved BOS token id"); @@ -405,10 +404,21 @@ pub(crate) fn run_info( let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path) .map_err(|e| anyhow::anyhow!("smoke_probe: load tokenizer: {e}"))?; - // B5b: seed with the fixed deterministic prompt (shared single source - // of truth in arch::smoke_prompt_ids — do NOT fork the seed logic). - let prompt_ids = arch::smoke_prompt_ids(&tokenizer, bos_id) - .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?; + // Build the smoke prompt through the model's real chat template when one + // exists (production-shaped, turn-structured input), falling back to the + // shared bare seed otherwise. This keeps the probe verdict consistent + // with how the model is actually served — a bare instruction can make a + // healthy instruction-tuned snapshot degenerate into a filler loop (the + // reference loader does the same), which previously raised false + // Broken* verdicts. `smoke_prompt_ids` returns None for base snapshots + // with no usable template; the bare seed then uses the canonical + // `load_bos_id` resolution already performed above. + let prompt_ids = match rmlx_server::chat_template::smoke_prompt_ids(model_path, &tokenizer) + { + Some(ids) => ids, + None => arch::smoke_prompt_ids(&tokenizer, bos_id) + .map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?, + }; info!( ?kv_quant_override, diff --git a/crates/rmlx-models/src/arch/loader.rs b/crates/rmlx-models/src/arch/loader.rs index 91a680b..22f0abf 100644 --- a/crates/rmlx-models/src/arch/loader.rs +++ b/crates/rmlx-models/src/arch/loader.rs @@ -371,6 +371,14 @@ pub fn smoke_prompt_ids(tokenizer: &tokenizers::Tokenizer, bos_id: u32) -> Resul /// `tokenizer_config.json`, runs greedy generation for 8 steps, and returns /// a `SmokeVerdict`. Used by the server's `--require-smoke-probe` gate (B5). /// +/// `prompt_ids_override` lets a caller that owns the chat-template engine +/// (e.g. `rmlx-server`) feed a production-shaped, turn-structured prompt so the +/// probe matches how the model is actually served. When `None`, the probe falls +/// back to the shared bare-instruction seed (`smoke_prompt_ids`). Instruction +/// models can degenerate into repeated filler on a bare prompt even when +/// healthy — the reference loader reproduces this identically — so the templated +/// path is preferred when available to avoid false `Broken*` verdicts. +/// /// Returns `Err` only for hard load/tokenizer failures. `Ok(verdict)` where /// `verdict != SmokeVerdict::Ok` means the snapshot is broken but loadable. pub fn run_smoke_probe( @@ -378,6 +386,7 @@ pub fn run_smoke_probe( device: Device, kv_quant: Option, max_ctx_override: Option, + prompt_ids_override: Option>, ) -> Result { let model = load_model(model_dir, device, &LoadOpts::default())?; @@ -389,11 +398,15 @@ pub fn run_smoke_probe( let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path) .map_err(|e| Error::Model(format!("run_smoke_probe: load tokenizer: {e}")))?; - // B5b: seed with a fixed deterministic prompt instead of bare BOS so - // healthy instruction-tuned snapshots generate real text. - let prompt_ids = smoke_prompt_ids(&tokenizer, bos_id)?; + // Prefer a caller-provided, chat-templated prompt; otherwise seed with the + // shared bare instruction so healthy snapshots generate real text. + let (prompt_ids, templated) = match prompt_ids_override { + Some(ids) if !ids.is_empty() => (ids, true), + _ => (smoke_prompt_ids(&tokenizer, bos_id)?, false), + }; tracing::info!( prompt_len = prompt_ids.len(), + templated, seed = SMOKE_PROMPT, "run_smoke_probe: seeded smoke prompt" ); diff --git a/crates/rmlx-server/src/chat_template.rs b/crates/rmlx-server/src/chat_template.rs index f8f221b..d63fee9 100644 --- a/crates/rmlx-server/src/chat_template.rs +++ b/crates/rmlx-server/src/chat_template.rs @@ -157,7 +157,7 @@ pub struct RenderOpts<'a> { /// Construct once per model; `render` may be called many times. #[allow( clippy::exhaustive_structs, - reason = "internal closed template struct — private minijinja environment field; public API is from_template_string() and render(); adding a field requires updating ChatTemplate::from_template_string" + reason = "internal closed template struct — private minijinja environment field; public API is ChatTemplate::new() and render(); adding a field requires updating ChatTemplate::new" )] pub struct ChatTemplate { env: Environment<'static>, @@ -218,6 +218,91 @@ pub fn load_template_source(model_dir: &Path) -> Result { .map_err(|e| Error::Other(format!("cannot read {}: {e}", path.display()))) } +// ── Smoke-probe prompt ───────────────────────────────────────────────────────── + +/// Build the smoke-probe input token ids for a model snapshot by rendering the +/// canonical seed through the model's real `chat_template.jinja`, so the probe +/// exercises the same prompt shape the model is served with in production. +/// +/// Instruction-tuned models are trained to continue *turn-structured* input +/// (`user … model`). Fed a bare instruction with +/// no turn scaffolding, a healthy model can still degenerate into a repeated +/// filler token — a behaviour the reference loader reproduces identically. That +/// made the bare-seed probe raise false `Broken*` verdicts for snapshots that +/// generate correctly through `serve`. Rendering the canonical seed through +/// `chat_template.jinja` removes that false positive generally, for any arch +/// whose bare-prompt continuation is degenerate. +/// +/// Returns `None` when the snapshot has no usable `chat_template.jinja` (or the +/// render/encode fails) — base / non-chat snapshots. The caller then passes +/// `None` to `run_smoke_probe`, which builds the shared bare seed itself with +/// its own canonical BOS resolution. Keeping the bare-seed construction out of +/// this function means the BOS fallback chain lives in exactly one place per +/// entry point and no token id is hard-coded here. +/// +/// The template emits its own ``, so the rendered text is tokenized with +/// `add_special_tokens = false` (mirrors the production request path). +pub fn smoke_prompt_ids(model_dir: &Path, tokenizer: &tokenizers::Tokenizer) -> Option> { + match render_templated_seed(model_dir, tokenizer) { + Ok(ids) => Some(ids), + Err(reason) => { + // Expected for base / non-chat snapshots. Recorded at debug level so + // a run's `.jsonl` shows whether the probe ran templated or fell back + // to the bare seed, and why — without warning on the normal case. + tracing::debug!( + reason, + "smoke_prompt_ids: no usable chat template — using bare seed" + ); + None + } + } +} + +/// Render `arch::SMOKE_PROMPT` as a single user turn through the model's chat +/// template and tokenize the result. Returns the encoded ids on success, or a +/// human-readable reason string on any miss (no template, compile/render/encode +/// failure, or empty output) so the caller can log it once at the fallback +/// boundary. +fn render_templated_seed( + model_dir: &Path, + tokenizer: &tokenizers::Tokenizer, +) -> std::result::Result, String> { + let src = load_template_source(model_dir).map_err(|e| format!("load template: {e}"))?; + let tpl = ChatTemplate::new(src).map_err(|e| format!("compile template: {e}"))?; + + let cfg = crate::tokenizer_io::load_tokenizer_config(model_dir) + .map_err(|e| format!("load tokenizer_config: {e}"))?; + let bos_token = cfg.bos_token.as_deref().unwrap_or(""); + let eos_token = cfg.eos_token.as_deref().unwrap_or(""); + + let messages = [ChatMessageTpl { + role: "user", + content: rmlx_models::arch::SMOKE_PROMPT, + ..ChatMessageTpl::default() + }]; + let opts = RenderOpts { + bos_token, + eos_token, + add_generation_prompt: true, + tools: &[], + enable_thinking: None, + }; + let rendered = tpl + .render(&messages, &opts) + .map_err(|e| format!("render template: {e}"))?; + + // The template already emits the BOS marker, so encode without re-adding + // special tokens (mirrors the production request path). + let enc = tokenizer + .encode(rendered.text.as_str(), false) + .map_err(|e| format!("encode rendered prompt: {e}"))?; + let ids = enc.get_ids().to_vec(); + if ids.is_empty() { + return Err("rendered prompt encoded to zero tokens".to_owned()); + } + Ok(ids) +} + // ── ChatTemplate ────────────────────────────────────────────────────────────── impl ChatTemplate { diff --git a/crates/rmlx-server/src/chat_template_tests.rs b/crates/rmlx-server/src/chat_template_tests.rs index adca95c..3644b5b 100644 --- a/crates/rmlx-server/src/chat_template_tests.rs +++ b/crates/rmlx-server/src/chat_template_tests.rs @@ -633,3 +633,112 @@ fn qwen3_plain_render_byte_identical_after_tool_support() { "plain render diverged after tool-support change (A5.2 invariant)" ); } + +// ── Smoke-probe prompt: template-shaped vs bare-seed fallback ─────────────── +// +// Regression guard for the gemma4-unified 4-bit false-positive: the smoke probe +// must feed turn-structured input when the snapshot ships a chat template, so a +// healthy instruction-tuned model is exercised the same way it is served. A bare +// instruction prompt can make even a healthy model loop a filler token; rendering +// through the template removes that false Broken* verdict. These two tests assert +// the template path is taken when present and the bare-seed fallback otherwise — +// no model snapshot required. + +/// A minimal WordLevel `tokenizer.json` whose vocab covers the turn markers, the +/// `user`/`model` role words, and every whitespace-split token of `SMOKE_PROMPT`. +/// Token ids are arbitrary but distinct so the encoded id sequence is checkable. +fn write_smoke_fixture(dir: &Path, with_template: bool) { + // Whitespace pre-tokenizer splits on spaces and punctuation, so `France?` + // becomes `France` + `?`; the vocab lists both pieces. The `?` id is 26. + let vocab = r#"{ + "":0,"":1,"":2, + "":10,"":11,"user":12,"model":13, + "What":20,"is":21,"the":22,"capital":23,"of":24,"France":25,"?":26 + }"#; + // The angle-bracket markers must tokenize atomically (the Whitespace + // pre-tokenizer would otherwise split `` into `< bos >`), so they are + // registered as added/special tokens — mirroring real HF tokenizers. + let added = r#"[ + {"id":0,"content":"","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true}, + {"id":10,"content":"","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true}, + {"id":11,"content":"","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true} + ]"#; + let tok_json = format!( + r#"{{"version":"1.0","truncation":null,"padding":null,"added_tokens":{added},"normalizer":null,"pre_tokenizer":{{"type":"Whitespace"}},"post_processor":null,"decoder":null,"model":{{"type":"WordLevel","vocab":{vocab},"unk_token":""}}}}"# + ); + std::fs::write(dir.join("tokenizer.json"), tok_json).expect("write tokenizer.json"); + std::fs::write( + dir.join("tokenizer_config.json"), + r#"{"bos_token":"","eos_token":""}"#, + ) + .expect("write tokenizer_config.json"); + if with_template { + // Gemma-style turn markers. The template emits its own BOS. Tokens are + // space-separated so the Whitespace pre-tokenizer yields exact vocab ids. + let tpl = "{{ bos_token }} {% for m in messages %} {{ m.role }} {{ m.content }} {% endfor %}{% if add_generation_prompt %} model{% endif %}"; + std::fs::write(dir.join("chat_template.jinja"), tpl).expect("write chat_template.jinja"); + } +} + +#[test] +fn smoke_prompt_uses_chat_template_when_present() { + let tmp = tempfile::tempdir().expect("tempdir"); + write_smoke_fixture(tmp.path(), true); + let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok"); + + let ids = smoke_prompt_ids(tmp.path(), &tk).expect("templated smoke prompt"); + + // Must be turn-structured: starts with BOS(0) + (10) user(12), + // contains the prompt tokens, and ends on the model-turn opener (10, 13). + assert_eq!( + ids.first(), + Some(&0), + "templated prompt must begin with BOS" + ); + assert!( + ids.windows(2).any(|w| w == [10, 12]), + "expected a user span: {ids:?}" + ); + assert_eq!( + &ids[ids.len() - 2..], + &[10, 13], + "templated prompt must end on the model opener: {ids:?}" + ); + // The prompt body tokens are present (What=20, capital=23, France=25). + for t in [20u32, 23, 25] { + assert!(ids.contains(&t), "missing prompt token {t} in {ids:?}"); + } +} + +/// Without a chat template, `smoke_prompt_ids` must return `None` so the caller +/// (`run_smoke_probe` / the CLI probe) builds the bare seed itself with its own +/// canonical BOS resolution. The function must NOT invent a token id — earlier +/// the server probe hard-coded a magic id when no `` resolved, seeding the +/// probe wrong; returning `None` keeps the BOS fallback chain in one place. +#[test] +fn smoke_prompt_falls_back_to_bare_seed_without_template() { + let tmp = tempfile::tempdir().expect("tempdir"); + write_smoke_fixture(tmp.path(), false); // no chat_template.jinja + let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok"); + + // No template ⇒ None (the templated path is the only thing this fn owns). + assert!( + smoke_prompt_ids(tmp.path(), &tk).is_none(), + "no chat template must yield None so the caller resolves BOS itself" + ); + + // The bare-seed builder the caller falls back to is the shared + // `arch::smoke_prompt_ids`, which takes the canonically-resolved BOS id and + // never substitutes a magic literal. Seeded with a real BOS (0 here) it + // produces [bos] + SMOKE_PROMPT, no turn markers. + let ids = rmlx_models::arch::smoke_prompt_ids(&tk, 0).expect("bare-seed smoke prompt"); + assert_eq!(ids.first(), Some(&0), "bare seed must begin with BOS"); + assert!( + !ids.contains(&10) && !ids.contains(&13), + "bare-seed fallback must not contain turn markers: {ids:?}" + ); + assert!( + ids.contains(&20) && ids.contains(&25), + "missing prompt body: {ids:?}" + ); +} diff --git a/crates/rmlx-server/src/openai/state.rs b/crates/rmlx-server/src/openai/state.rs index 0e8c479..869a5fa 100644 --- a/crates/rmlx-server/src/openai/state.rs +++ b/crates/rmlx-server/src/openai/state.rs @@ -854,11 +854,22 @@ impl AppState { // zero-overhead path unchanged. if self.require_smoke_probe { tracing::info!(model_id, path = %entry.abs_path.display(), "B5: running smoke probe before first load"); + + // Render the smoke seed through the model's real chat template so the + // probe exercises production-shaped, turn-structured input. When no + // usable template exists, `smoke_prompt_ids` returns None and + // `run_smoke_probe` builds the bare seed itself with its own + // canonical BOS resolution — no token id is invented here. + let templated_prompt = crate::tokenizer_io::load_tokenizer(&entry.abs_path) + .ok() + .and_then(|tk| crate::chat_template::smoke_prompt_ids(&entry.abs_path, &tk)); + let verdict = rmlx_models::arch::run_smoke_probe( &entry.abs_path, rmlx_mlx::Device::Gpu, None, // use arch default KV quant None, // use model default max_ctx + templated_prompt, ) .map_err(|e| format!("smoke probe error for '{model_id}': {e}"))?; diff --git a/docs/CLI.md b/docs/CLI.md index fb35711..018a2af 100644 --- a/docs/CLI.md +++ b/docs/CLI.md @@ -265,6 +265,15 @@ rmlx info --model /path/to/snapshot --probe-smoke | `--max-ctx` | u32 | (from model) | **Virtual ceiling** on context length, in tokens (not an eager allocation): the KV ring grows lazily up to it, prompts over it are rejected. See `docs/KV_CACHE.md` §4.6. Must be ≥ 256 when set. | | `--list-cache-types` | bool flag | off | Print the full §D1 KV codec table and exit. No model load. | +The smoke probe renders its fixed seed prompt through the snapshot's +`chat_template.jinja` when present, so an instruction-tuned model is exercised +on the same turn-structured input it is served with. A *bare* instruction (no +turn markers) makes some healthy instruction-tuned models loop a filler token — +the reference loader (`mlx-lm`) reproduces this identically — which previously +raised false `BrokenPunctLoop` verdicts (e.g. the QAT-4bit `gemma-4-12B` +unified snapshots, which serve coherently via the chat template). Snapshots +with no chat template fall back to the bare-instruction seed. + --- ### `baseline` diff --git a/docs/MODELS.md b/docs/MODELS.md index b45b539..dfb56fa 100644 --- a/docs/MODELS.md +++ b/docs/MODELS.md @@ -649,6 +649,16 @@ arch string to the Gemma4 text loader; the multimodal-embedder tensors (`embed_vision`/`embed_audio`/`vision_embedder.*`) are not read, so image/audio input is not yet wired for 12B (text serves end-to-end). +Text serves correctly at **all weight quants**, including the mixed 4/8-bit QAT +snapshots (`gemma-4-12B-it-qat-4bit` affine, `gemma-4-12B-it-qat-mxfp4`): their +`quantization` block keeps the MLP `gate/up/down` projections at 8-bit while the +rest is 4-bit, which the per-tensor override resolver handles unchanged. These +snapshots emit a degenerate filler token (`'1'`) on a *bare* instruction prompt +with no turn markers — `mlx-lm` reproduces this identically, and the mxfp8 build +degenerates the same way to `.`/`_`. The `--probe-smoke` heuristic therefore +templates its seed (see `docs/CLI.md` `info`) so the verdict matches the served +behaviour rather than the bare-prompt artifact. + ### Key structural properties **SWA + FullAttention alternation.** Per-layer `layer_types` array determines