Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 16 additions & 6 deletions crates/rmlx-cli/src/commands/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -396,19 +396,29 @@ pub(crate) fn run_info(
};
print_load_phases_json();

// Load tokenizer. Inline BOS extraction -- avoids adding rmlx-server as a dep
// of the probe path (dep-DAG constraint). ~10 lines mirrors tokenizer_io logic.
// Load tokenizer.
let bos_id = load_bos_id(model_path)?;
info!(bos_id, "smoke_probe: resolved BOS token id");

let tokenizer_path = model_path.join("tokenizer.json");
let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
.map_err(|e| anyhow::anyhow!("smoke_probe: load tokenizer: {e}"))?;

// B5b: seed with the fixed deterministic prompt (shared single source
// of truth in arch::smoke_prompt_ids — do NOT fork the seed logic).
let prompt_ids = arch::smoke_prompt_ids(&tokenizer, bos_id)
.map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?;
// Build the smoke prompt through the model's real chat template when one
// exists (production-shaped, turn-structured input), falling back to the
// shared bare seed otherwise. This keeps the probe verdict consistent
// with how the model is actually served — a bare instruction can make a
// healthy instruction-tuned snapshot degenerate into a filler loop (the
// reference loader does the same), which previously raised false
// Broken* verdicts. `smoke_prompt_ids` returns None for base snapshots
// with no usable template; the bare seed then uses the canonical
// `load_bos_id` resolution already performed above.
let prompt_ids = match rmlx_server::chat_template::smoke_prompt_ids(model_path, &tokenizer)
{
Some(ids) => ids,
None => arch::smoke_prompt_ids(&tokenizer, bos_id)
.map_err(|e| anyhow::anyhow!("smoke_probe: build seed prompt: {e}"))?,
};

info!(
?kv_quant_override,
Expand Down
19 changes: 16 additions & 3 deletions crates/rmlx-models/src/arch/loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -371,13 +371,22 @@ pub fn smoke_prompt_ids(tokenizer: &tokenizers::Tokenizer, bos_id: u32) -> Resul
/// `tokenizer_config.json`, runs greedy generation for 8 steps, and returns
/// a `SmokeVerdict`. Used by the server's `--require-smoke-probe` gate (B5).
///
/// `prompt_ids_override` lets a caller that owns the chat-template engine
/// (e.g. `rmlx-server`) feed a production-shaped, turn-structured prompt so the
/// probe matches how the model is actually served. When `None`, the probe falls
/// back to the shared bare-instruction seed (`smoke_prompt_ids`). Instruction
/// models can degenerate into repeated filler on a bare prompt even when
/// healthy — the reference loader reproduces this identically — so the templated
/// path is preferred when available to avoid false `Broken*` verdicts.
///
/// Returns `Err` only for hard load/tokenizer failures. `Ok(verdict)` where
/// `verdict != SmokeVerdict::Ok` means the snapshot is broken but loadable.
pub fn run_smoke_probe(
model_dir: &Path,
device: Device,
kv_quant: Option<rmlx_kv_quant::KvQuant>,
max_ctx_override: Option<i32>,
prompt_ids_override: Option<Vec<u32>>,
) -> Result<crate::decode_loop::SmokeVerdict> {
let model = load_model(model_dir, device, &LoadOpts::default())?;

Expand All @@ -389,11 +398,15 @@ pub fn run_smoke_probe(
let tokenizer = tokenizers::Tokenizer::from_file(&tokenizer_path)
.map_err(|e| Error::Model(format!("run_smoke_probe: load tokenizer: {e}")))?;

// B5b: seed with a fixed deterministic prompt instead of bare BOS so
// healthy instruction-tuned snapshots generate real text.
let prompt_ids = smoke_prompt_ids(&tokenizer, bos_id)?;
// Prefer a caller-provided, chat-templated prompt; otherwise seed with the
// shared bare instruction so healthy snapshots generate real text.
let (prompt_ids, templated) = match prompt_ids_override {
Some(ids) if !ids.is_empty() => (ids, true),
_ => (smoke_prompt_ids(&tokenizer, bos_id)?, false),
};
tracing::info!(
prompt_len = prompt_ids.len(),
templated,
seed = SMOKE_PROMPT,
"run_smoke_probe: seeded smoke prompt"
);
Expand Down
87 changes: 86 additions & 1 deletion crates/rmlx-server/src/chat_template.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ pub struct RenderOpts<'a> {
/// Construct once per model; `render` may be called many times.
#[allow(
clippy::exhaustive_structs,
reason = "internal closed template struct — private minijinja environment field; public API is from_template_string() and render(); adding a field requires updating ChatTemplate::from_template_string"
reason = "internal closed template struct — private minijinja environment field; public API is ChatTemplate::new() and render(); adding a field requires updating ChatTemplate::new"
)]
pub struct ChatTemplate {
env: Environment<'static>,
Expand Down Expand Up @@ -218,6 +218,91 @@ pub fn load_template_source(model_dir: &Path) -> Result<String> {
.map_err(|e| Error::Other(format!("cannot read {}: {e}", path.display())))
}

// ── Smoke-probe prompt ─────────────────────────────────────────────────────────

/// Produce the smoke-probe prompt as token ids, shaped by the snapshot's own
/// `chat_template.jinja`, so the probe is fed the same turn-structured input
/// the model receives when it is served.
///
/// Why templated: instruction-tuned models are trained on turn scaffolding
/// (`<start_of_turn>user … <start_of_turn>model`). Given only a bare
/// instruction, a healthy model can still collapse into a repeated filler
/// token — the reference loader shows the same behaviour — and the bare-seed
/// probe then reported false `Broken*` verdicts for snapshots that generate
/// correctly through `serve`. Rendering the canonical seed through the chat
/// template avoids that false positive for any arch with a degenerate
/// bare-prompt continuation.
///
/// # Returns
///
/// * `Some(ids)` — the non-empty encoded ids of the rendered prompt. The
///   template emits its own `<bos>`, so the text is tokenized with
///   `add_special_tokens = false` (mirrors the production request path).
/// * `None` — no templated prompt could be built (no usable template, or the
///   render/encode step failed); typical for base / non-chat snapshots. The
///   reason is logged once at debug level.
///
/// On `None` the caller is expected to fall back to the shared bare seed and
/// resolve BOS through its own canonical chain. This function deliberately
/// never builds the bare seed and never hard-codes a token id, so the BOS
/// fallback logic stays in one place per entry point.
pub fn smoke_prompt_ids(model_dir: &Path, tokenizer: &tokenizers::Tokenizer) -> Option<Vec<u32>> {
    let reason = match render_templated_seed(model_dir, tokenizer) {
        Ok(ids) => return Some(ids),
        Err(reason) => reason,
    };

    // Falling back is the normal case for base / non-chat snapshots, so this is
    // logged at debug rather than warn: the run's `.jsonl` still records that
    // the probe used the bare seed, and why.
    tracing::debug!(
        reason,
        "smoke_prompt_ids: no usable chat template — using bare seed"
    );
    None
}

/// Wrap `arch::SMOKE_PROMPT` in a single user turn, render it through the
/// snapshot's chat template with the generation prompt enabled, and tokenize
/// the rendered text.
///
/// Steps, in order (the first failure wins and determines the reason string):
/// 1. load the template source from `model_dir` and compile it;
/// 2. load the tokenizer config to obtain the BOS/EOS token strings (a missing
///    entry is passed to the template as an empty string);
/// 3. render one `user` message with `add_generation_prompt = true`;
/// 4. encode the rendered text without adding special tokens.
///
/// Returns the encoded ids, or a human-readable reason on any miss — including
/// an encode that yields zero tokens — so the caller can log it once at the
/// fallback boundary.
fn render_templated_seed(
    model_dir: &Path,
    tokenizer: &tokenizers::Tokenizer,
) -> std::result::Result<Vec<u32>, String> {
    let template = load_template_source(model_dir)
        .map_err(|e| format!("load template: {e}"))
        .and_then(|source| {
            ChatTemplate::new(source).map_err(|e| format!("compile template: {e}"))
        })?;

    let tok_cfg = crate::tokenizer_io::load_tokenizer_config(model_dir)
        .map_err(|e| format!("load tokenizer_config: {e}"))?;

    let turns = [ChatMessageTpl {
        role: "user",
        content: rmlx_models::arch::SMOKE_PROMPT,
        ..ChatMessageTpl::default()
    }];
    let render_opts = RenderOpts {
        bos_token: tok_cfg.bos_token.as_deref().unwrap_or(""),
        eos_token: tok_cfg.eos_token.as_deref().unwrap_or(""),
        add_generation_prompt: true,
        tools: &[],
        enable_thinking: None,
    };
    let rendered = template
        .render(&turns, &render_opts)
        .map_err(|e| format!("render template: {e}"))?;

    // BOS is already part of the rendered text (the template emits it), so the
    // tokenizer must not add special tokens a second time; this mirrors the
    // production request path.
    let ids = tokenizer
        .encode(rendered.text.as_str(), false)
        .map_err(|e| format!("encode rendered prompt: {e}"))?
        .get_ids()
        .to_vec();

    if ids.is_empty() {
        Err("rendered prompt encoded to zero tokens".to_owned())
    } else {
        Ok(ids)
    }
}

// ── ChatTemplate ──────────────────────────────────────────────────────────────

impl ChatTemplate {
Expand Down
109 changes: 109 additions & 0 deletions crates/rmlx-server/src/chat_template_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -633,3 +633,112 @@ fn qwen3_plain_render_byte_identical_after_tool_support() {
"plain render diverged after tool-support change (A5.2 invariant)"
);
}

// ── Smoke-probe prompt: template-shaped vs bare-seed fallback ───────────────
//
// Regression guard for the gemma4-unified 4-bit false-positive: the smoke probe
// must feed turn-structured input when the snapshot ships a chat template, so a
// healthy instruction-tuned model is exercised the same way it is served. A bare
// instruction prompt can make even a healthy model loop a filler token; rendering
// through the template removes that false Broken* verdict. These two tests assert
// the template path is taken when present and the bare-seed fallback otherwise —
// no model snapshot required.

/// Write a minimal model-snapshot fixture into `dir` for the smoke-prompt tests.
///
/// Files written:
/// - `tokenizer.json`: a minimal WordLevel tokenizer whose vocab covers the turn
///   markers, the `user`/`model` role words, and every whitespace-split token of
///   `SMOKE_PROMPT`. Token ids are arbitrary but distinct so the encoded id
///   sequence is checkable.
/// - `tokenizer_config.json`: declares `<bos>` / `<eos>` as the BOS / EOS tokens.
/// - `chat_template.jinja`: only when `with_template` is true; a Gemma-style
///   turn template that emits its own BOS.
///
/// Panics (via `expect`) if any of the files cannot be written.
fn write_smoke_fixture(dir: &Path, with_template: bool) {
// Whitespace pre-tokenizer splits on spaces and punctuation, so `France?`
// becomes `France` + `?`; the vocab lists both pieces. The `?` id is 26.
let vocab = r#"{
"<bos>":0,"<eos>":1,"<unk>":2,
"<start_of_turn>":10,"<end_of_turn>":11,"user":12,"model":13,
"What":20,"is":21,"the":22,"capital":23,"of":24,"France":25,"?":26
}"#;
// The angle-bracket markers must tokenize atomically (the Whitespace
// pre-tokenizer would otherwise split `<bos>` into `< bos >`), so they are
// registered as added/special tokens — mirroring real HF tokenizers.
let added = r#"[
{"id":0,"content":"<bos>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},
{"id":10,"content":"<start_of_turn>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},
{"id":11,"content":"<end_of_turn>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true}
]"#;
// Assemble the full tokenizer definition; `{{`/`}}` are escaped braces, while
// `{added}` and `{vocab}` splice in the JSON fragments built above.
let tok_json = format!(
r#"{{"version":"1.0","truncation":null,"padding":null,"added_tokens":{added},"normalizer":null,"pre_tokenizer":{{"type":"Whitespace"}},"post_processor":null,"decoder":null,"model":{{"type":"WordLevel","vocab":{vocab},"unk_token":"<unk>"}}}}"#
);
std::fs::write(dir.join("tokenizer.json"), tok_json).expect("write tokenizer.json");
// The templated path reads the BOS/EOS token strings from this config.
std::fs::write(
dir.join("tokenizer_config.json"),
r#"{"bos_token":"<bos>","eos_token":"<eos>"}"#,
)
.expect("write tokenizer_config.json");
if with_template {
// Gemma-style turn markers. The template emits its own BOS. Tokens are
// space-separated so the Whitespace pre-tokenizer yields exact vocab ids.
let tpl = "{{ bos_token }} {% for m in messages %}<start_of_turn> {{ m.role }} {{ m.content }} <end_of_turn> {% endfor %}{% if add_generation_prompt %}<start_of_turn> model{% endif %}";
std::fs::write(dir.join("chat_template.jinja"), tpl).expect("write chat_template.jinja");
}
}

/// With a `chat_template.jinja` in the snapshot, `smoke_prompt_ids` must return
/// a turn-structured prompt: BOS first, a `<start_of_turn> user` span, the
/// prompt body, and the `<start_of_turn> model` generation opener last.
#[test]
fn smoke_prompt_uses_chat_template_when_present() {
    let tmp = tempfile::tempdir().expect("tempdir");
    write_smoke_fixture(tmp.path(), true);
    let tk = tokenizers::Tokenizer::from_file(tmp.path().join("tokenizer.json")).expect("load tok");

    let ids = smoke_prompt_ids(tmp.path(), &tk).expect("templated smoke prompt");

    // Must be turn-structured: starts with BOS(0), contains a
    // <start_of_turn>(10) user(12) span and the prompt tokens, and ends on the
    // model-turn opener (10, 13).
    assert_eq!(
        ids.first(),
        Some(&0),
        "templated prompt must begin with BOS"
    );
    assert!(
        ids.windows(2).any(|w| w == [10, 12]),
        "expected a <start_of_turn> user span: {ids:?}"
    );
    // `ends_with` instead of slicing `ids[ids.len() - 2..]`: the slice form
    // underflows and panics when fewer than two ids come back, which would hide
    // this diagnostic behind an arithmetic-overflow panic.
    assert!(
        ids.ends_with(&[10, 13]),
        "templated prompt must end on the <start_of_turn> model opener: {ids:?}"
    );
    // The prompt body tokens are present (What=20, capital=23, France=25).
    for t in [20u32, 23, 25] {
        assert!(ids.contains(&t), "missing prompt token {t} in {ids:?}");
    }
}

/// A snapshot with no chat template must make `smoke_prompt_ids` return `None`,
/// leaving the bare seed — and its canonical BOS resolution — to the caller
/// (`run_smoke_probe` / the CLI probe). The function must NOT invent a token
/// id: an earlier server probe hard-coded a magic id when no `<bos>` resolved
/// and seeded the probe wrong. Returning `None` keeps the BOS fallback chain in
/// one place.
#[test]
fn smoke_prompt_falls_back_to_bare_seed_without_template() {
    let fixture = tempfile::tempdir().expect("tempdir");
    let root = fixture.path();
    // `false` ⇒ the fixture ships no chat_template.jinja.
    write_smoke_fixture(root, false);
    let tokenizer =
        tokenizers::Tokenizer::from_file(root.join("tokenizer.json")).expect("load tok");

    // The templated path is all this function owns, so no template ⇒ None.
    let templated = smoke_prompt_ids(root, &tokenizer);
    assert!(
        templated.is_none(),
        "no chat template must yield None so the caller resolves BOS itself"
    );

    // What the caller falls back to: the shared `arch::smoke_prompt_ids`, fed
    // the canonically-resolved BOS id (0 in this fixture) and never a magic
    // literal. The result is [bos] + SMOKE_PROMPT with no turn markers.
    let ids = rmlx_models::arch::smoke_prompt_ids(&tokenizer, 0).expect("bare-seed smoke prompt");
    let has = |id: u32| ids.contains(&id);

    assert_eq!(ids.first(), Some(&0), "bare seed must begin with BOS");
    assert!(
        !(has(10) || has(13)),
        "bare-seed fallback must not contain turn markers: {ids:?}"
    );
    assert!(has(20) && has(25), "missing prompt body: {ids:?}");
}
11 changes: 11 additions & 0 deletions crates/rmlx-server/src/openai/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -854,11 +854,22 @@ impl AppState {
// zero-overhead path unchanged.
if self.require_smoke_probe {
tracing::info!(model_id, path = %entry.abs_path.display(), "B5: running smoke probe before first load");

// Render the smoke seed through the model's real chat template so the
// probe exercises production-shaped, turn-structured input. When no
// usable template exists, `smoke_prompt_ids` returns None and
// `run_smoke_probe` builds the bare seed itself with its own
// canonical BOS resolution — no token id is invented here.
let templated_prompt = crate::tokenizer_io::load_tokenizer(&entry.abs_path)
.ok()
.and_then(|tk| crate::chat_template::smoke_prompt_ids(&entry.abs_path, &tk));

let verdict = rmlx_models::arch::run_smoke_probe(
&entry.abs_path,
rmlx_mlx::Device::Gpu,
None, // use arch default KV quant
None, // use model default max_ctx
templated_prompt,
)
.map_err(|e| format!("smoke probe error for '{model_id}': {e}"))?;

Expand Down
9 changes: 9 additions & 0 deletions docs/CLI.md
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,15 @@ rmlx info --model /path/to/snapshot --probe-smoke
| `--max-ctx` | u32 | (from model) | **Virtual ceiling** on context length, in tokens (not an eager allocation): the KV ring grows lazily up to it, prompts over it are rejected. See `docs/KV_CACHE.md` §4.6. Must be ≥ 256 when set. |
| `--list-cache-types` | bool flag | off | Print the full §D1 KV codec table and exit. No model load. |

The smoke probe renders its fixed seed prompt through the snapshot's
`chat_template.jinja` when present, so an instruction-tuned model is exercised
on the same turn-structured input it is served with. A *bare* instruction (no
turn markers) makes some healthy instruction-tuned models loop a filler token —
the reference loader (`mlx-lm`) reproduces this identically — which previously
raised false `BrokenPunctLoop` verdicts (e.g. the QAT-4bit `gemma-4-12B`
unified snapshots, which serve coherently via the chat template). Snapshots
with no usable chat template (none shipped, or one that fails to render or
encode) fall back to the bare-instruction seed.

---

### `baseline`
Expand Down
10 changes: 10 additions & 0 deletions docs/MODELS.md
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,16 @@ arch string to the Gemma4 text loader; the multimodal-embedder tensors
(`embed_vision`/`embed_audio`/`vision_embedder.*`) are not read, so image/audio
input is not yet wired for 12B (text serves end-to-end).

Text serves correctly at **all weight quants**, including the mixed 4/8-bit QAT
snapshots (`gemma-4-12B-it-qat-4bit` affine, `gemma-4-12B-it-qat-mxfp4`): their
`quantization` block keeps the MLP `gate/up/down` projections at 8-bit while the
rest is 4-bit, which the per-tensor override resolver handles unchanged. These
snapshots emit a degenerate filler token (`'1'`) on a *bare* instruction prompt
with no turn markers — `mlx-lm` reproduces this identically, and the mxfp8 build
degenerates the same way to `.`/`_`. The `--probe-smoke` check therefore renders
its seed through the chat template (see the `info` section of `docs/CLI.md`) so
the verdict matches the served behaviour rather than the bare-prompt artifact.

### Key structural properties

**SWA + FullAttention alternation.** Per-layer `layer_types` array determines
Expand Down
Loading