diff --git a/Cargo.lock b/Cargo.lock index fb9f76c..0fd2e30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,7 +24,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "getrandom 0.3.4", "once_cell", + "serde", "version_check", "zerocopy", ] @@ -104,7 +106,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -115,7 +117,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -243,7 +245,7 @@ dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.13.0", "log", "prettyplease", "proc-macro2", @@ -299,6 +301,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + [[package]] name = "cc" version = "1.2.61" @@ -420,17 +431,31 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "compact_str" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dfdd1c2274d9aa354115b09dc9a901d6c5576818cdf70d14cae2bdb47df00ab" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + [[package]] name = "console" -version = "0.15.11" +version = "0.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +checksum = "d64e8af5551369d19cf50138de61f1c42074ab970f74e99be916646777f8fc87" dependencies = [ "encode_unicode", "libc", - "once_cell", "unicode-width", - "windows-sys 0.59.0", + "windows-sys", ] [[package]] @@ -578,6 +603,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "daachorse" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f55d7153ba3b507595872a3874803f07a8a81d1e888abed8e5db7da0597d6e2" + [[package]] name = "darling" version = "0.20.11" @@ -613,6 +644,15 @@ dependencies = [ "syn", ] +[[package]] +name = "dary_heap" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe" +dependencies = [ + "serde", +] + [[package]] name = "der" version = "0.8.0" @@ -721,7 +761,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -914,13 +954,14 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.17" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", "libc", - "wasi", + "r-efi 5.3.0", + "wasip2", ] [[package]] @@ -931,7 +972,7 @@ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", "wasip2", "wasip3", ] @@ -1165,14 +1206,14 @@ dependencies = [ [[package]] name = "indicatif" -version = "0.17.11" +version = "0.18.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +checksum = "25470f23803092da7d239834776d653104d551bc4d7eacaf31e6837854b8e9eb" dependencies = [ "console", - "number_prefix", "portable-atomic", "unicode-width", + "unit-prefix", "web-time", ] @@ -1184,27 +1225,18 @@ checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" [[package]] name = "itertools" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" -dependencies = [ - "either", -] - -[[package]] -name = "itertools" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" dependencies = [ "either", ] [[package]] name = "itertools" -version = "0.13.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" dependencies = [ "either", ] @@ -1387,7 +1419,7 @@ checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", "wasi", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -1472,7 +1504,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -1508,12 +1540,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "number_prefix" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" - [[package]] name = "object" version = "0.37.3" @@ -1777,6 +1803,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "r-efi" version = "6.0.0" @@ -1785,20 +1817,19 @@ checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" [[package]] name = "rand" -version = "0.8.6" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ - "libc", "rand_chacha", "rand_core", ] [[package]] name = "rand_chacha" -version = "0.3.1" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", "rand_core", @@ -1806,11 +1837,11 @@ dependencies = [ [[package]] name = "rand_core" -version = "0.6.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" dependencies = [ - "getrandom 0.2.17", + "getrandom 0.3.4", ] [[package]] @@ -1825,12 +1856,12 @@ dependencies = [ [[package]] name = "rayon-cond" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" dependencies = [ "either", - "itertools 0.11.0", + "itertools 0.14.0", "rayon", ] @@ -1901,7 +1932,7 @@ dependencies = [ "serde_json", "symphonia", "tempfile", - "thiserror 2.0.18", + "thiserror", "tokenizers", "tracing", ] @@ -1949,7 +1980,7 @@ dependencies = [ "serde", "serde_json", "tempfile", - "thiserror 2.0.18", + "thiserror", "toml", "tracing", ] @@ -1960,7 +1991,7 @@ version = "0.2.0" dependencies = [ "rmlx-core", "rmlx-mlx", - "thiserror 2.0.18", + "thiserror", "tracing", ] @@ -1976,7 +2007,7 @@ dependencies = [ "safetensors", "serde_json", "tempfile", - "thiserror 2.0.18", + "thiserror", "tracing", ] @@ -1992,7 +2023,7 @@ dependencies = [ "serde", "serde_json", "tempfile", - "thiserror 2.0.18", + "thiserror", "tracing", ] @@ -2008,7 +2039,7 @@ dependencies = [ "serde_json", "sha2", "tempfile", - "thiserror 2.0.18", + "thiserror", "time", "toml", "tracing", @@ -2023,7 +2054,7 @@ dependencies = [ "rmlx-core", "rmlx-loader", "safetensors", - "thiserror 2.0.18", + "thiserror", "tracing", ] @@ -2048,7 +2079,7 @@ dependencies = [ "serde_json", "symphonia", "tempfile", - "thiserror 2.0.18", + "thiserror", "tokenizers", "tracing", "twox-hash", @@ -2061,7 +2092,7 @@ dependencies = [ "criterion", "rmlx-core", "rmlx-kv-quant", - "thiserror 2.0.18", + "thiserror", "tracing", ] @@ -2098,7 +2129,7 @@ dependencies = [ "serde", "serde_json", "tempfile", - "thiserror 2.0.18", + "thiserror", "tokenizers", "tokio", "tower", @@ -2163,7 +2194,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -2213,7 +2244,7 @@ version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" dependencies = [ - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -2388,7 +2419,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -2409,6 +2440,12 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strength_reduce" version = "0.2.4" @@ -2528,16 +2565,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.61.2", -] - -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", + "windows-sys", ] [[package]] @@ -2546,18 +2574,7 @@ version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.18", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "thiserror-impl", ] [[package]] @@ -2648,17 +2665,19 @@ dependencies = [ [[package]] name = "tokenizers" -version = "0.20.4" +version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b08cc37428a476fc9e20ac850132a513a2e1ce32b6a31addf2b74fa7033b905" +checksum = "44e5bea67576e04b6ff8564c5d9e09c2ef0cf476502245f2f120e497769d3112" dependencies = [ - "aho-corasick", + "ahash", + "compact_str", + "daachorse", + "dary_heap", "derive_builder", "esaxx-rs", - "getrandom 0.2.17", + "getrandom 0.3.4", "indicatif", - "itertools 0.12.1", - "lazy_static", + "itertools 0.14.0", "log", "macro_rules_attribute", "monostate", @@ -2672,7 +2691,7 @@ dependencies = [ "serde", "serde_json", "spm_precompiled", - "thiserror 1.0.69", + "thiserror", "unicode-normalization-alignments", "unicode-segmentation", "unicode_categories", @@ -2691,7 +2710,7 @@ dependencies = [ "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -2792,7 +2811,7 @@ checksum = "050686193eb999b4bb3bc2acfa891a13da00f79734704c4b8b4ef1a10b368a3c" dependencies = [ "crossbeam-channel", "symlink", - "thiserror 2.0.18", + "thiserror", "time", "tracing-subscriber", ] @@ -2921,6 +2940,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" +[[package]] +name = "unit-prefix" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3" + [[package]] name = "ureq" version = "3.3.0" @@ -3155,7 +3180,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys", ] [[package]] @@ -3223,15 +3248,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-sys" -version = "0.59.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" -dependencies = [ - "windows-targets", -] - [[package]] name = "windows-sys" version = "0.61.2" @@ -3241,70 +3257,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-targets" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" -dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" - -[[package]] -name = "windows_i686_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" - -[[package]] -name = "windows_i686_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" - -[[package]] -name = "windows_i686_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" - [[package]] name = "winnow" version = "1.0.3" diff --git a/Cargo.toml b/Cargo.toml index f8fe03c..7d5a688 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -82,7 +82,7 @@ tower = { version = "0.5", default-features = false } # Templates / tokenizer minijinja = { version = "2", features = ["json", "preserve_order"] } -tokenizers = "0.20" +tokenizers = "0.23" # MLX bindings — disabled until tensor compute is needed (S1.4 Gemma4 graph). # See FINDINGS.md "MLX bindings" for the build-tooling situation. diff --git a/crates/rmlx-audio/src/tts.rs b/crates/rmlx-audio/src/tts.rs index ceb9d51..409387e 100644 --- a/crates/rmlx-audio/src/tts.rs +++ b/crates/rmlx-audio/src/tts.rs @@ -1928,7 +1928,13 @@ impl TtsTokenizer { }) .collect(); if !special_tokens.is_empty() { - inner.add_special_tokens(&special_tokens); + // tokenizers 0.21+ takes the tokens by value + // (`impl IntoIterator`) and + // returns the added count as a `Result`; the + // local Vec is not used afterwards. + inner + .add_special_tokens(special_tokens) + .map_err(|e| TtsError::Tokenizer(e.to_string()))?; } } } diff --git a/crates/rmlx-models/src/decode_loop_tests.rs b/crates/rmlx-models/src/decode_loop_tests.rs index d8a0a89..9a4b782 100644 --- a/crates/rmlx-models/src/decode_loop_tests.rs +++ b/crates/rmlx-models/src/decode_loop_tests.rs @@ -1,12 +1,12 @@ use super::*; use std::cell::Cell; -use std::collections::HashMap; +use std::fmt::Write as _; +use std::str::FromStr as _; use std::sync::{Mutex, MutexGuard}; use rmlx_core::error::Error; use rmlx_kv_quant::KvQuant; -use tokenizers::models::wordlevel::WordLevel; use tokenizers::Tokenizer; /// Serializes the MLX-touching tests *in this module* against each other. MLX @@ -55,18 +55,25 @@ fn id_of(a: &Array) -> u32 { } /// A minimal `WordLevel` tokenizer over ids `0..n` mapped to `t`. +/// +/// Built from a HuggingFace tokenizer JSON string rather than the +/// `WordLevelBuilder::vocab` API: tokenizers 0.21+ switched the builder's +/// vocab map to `ahash::AHashMap`, which is not a direct dependency of this +/// crate. The JSON path is behaviour-identical and dependency-free. #[allow(clippy::expect_used)] fn tiny_tokenizer(n: u32) -> Tokenizer { - let mut vocab: HashMap = HashMap::new(); + let mut vocab = String::from("{"); for i in 0..n { - vocab.insert(format!("t{i}"), i); + if i > 0 { + vocab.push(','); + } + let _ = write!(vocab, "\"t{i}\":{i}"); } - let wl = WordLevel::builder() - .vocab(vocab) - .unk_token("".to_string()) - .build() - .expect("build wordlevel"); - Tokenizer::new(wl) + vocab.push('}'); + let json = format!( + r#"{{"version":"1.0","truncation":null,"padding":null,"added_tokens":[],"normalizer":null,"pre_tokenizer":null,"post_processor":null,"decoder":null,"model":{{"type":"WordLevel","vocab":{vocab},"unk_token":""}}}}"# + ); + Tokenizer::from_str(&json).expect("build wordlevel tokenizer") } fn greedy_cfg() -> SamplerConfig {