diff --git a/Cargo.lock b/Cargo.lock index 01d56cd..6df81b9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "autocfg" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + [[package]] name = "bit-set" version = "0.8.0" @@ -26,12 +32,34 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" +[[package]] +name = "bitflags" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + [[package]] name = "daachorse" version = "3.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "99251f238b74cd219a86fe6ea9328308ebb223fcbb5b8eb5aa400b847a41dded" +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys", +] + [[package]] name = "fancy-regex" version = "0.18.0" @@ -43,12 +71,176 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi 5.3.0", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", +] + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + [[package]] name = "memchr" version = "2.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "proptest" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b45fcc2344c680f5025fe57779faef368840d0bd1f42f216291f0dc4ace4744" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quote" +version = "1.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbc457d0c7a0759a614551b11a6409e5951f6c7537be1f1b7682b9ae9230368" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rand_xorshift" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "513962919efc330f829edb2535844d1b912b0fbe2ca165d613e4e8788bb05a5a" +dependencies = [ + "rand_core", +] + [[package]] name = "regex" version = "1.12.4" @@ -78,6 +270,31 @@ version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "rusty-fork" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc6bf79ff24e648f6da1f8d1f011e9cac26491b619e6b9280f2b47f1774e6ee2" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + [[package]] name = "stella-aho-corasick-core" version = "1.0.4" @@ -113,11 +330,36 @@ dependencies = [ name = "stella-text-search-core" version = "1.0.6" dependencies = [ + "proptest", "stella-aho-corasick-core", "stella-fuzzy-search-core", "stella-regex-set-core", ] +[[package]] +name = "syn" +version = "2.0.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.3", + "once_cell", + "rustix", + "windows-sys", +] + [[package]] name = "tinyvec" version = "1.11.0" @@ -133,12 +375,24 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + [[package]] name = "unicode-case-mapping" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4e9026503b74f3207a4c04e6bf4ea735daa8edf6c0bbfa044cae597bb947a9db" +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + [[package]] name = "unicode-normalization" version = "0.1.25" @@ -153,3 +407,62 @@ name = "unicode-segmentation" version = "1.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" + +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + +[[package]] +name = "wasip2" +version = "1.0.4+wasi-0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "zerocopy" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/crates/core/Cargo.toml b/crates/core/Cargo.toml index 5f4d757..3f33367 100644 --- a/crates/core/Cargo.toml +++ b/crates/core/Cargo.toml @@ -12,5 +12,8 @@ stella-aho-corasick-core = { version = "1.0.4", git = "https://github.com/stella stella-fuzzy-search-core = { version = "1.1.3", git = "https://github.com/stella/fuzzy-search", rev = "0743b9c6710a84bb7e6863fdcda9a9cc1dce4fa2" } stella-regex-set-core = { version = "1.0.5", git = "https://github.com/stella/regex-set", rev = "75b6a7f7a89880b70c8497f5b86a3f09748ea3fd" } +[dev-dependencies] +proptest = "1" + [lints] workspace = true diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index ab5b88e..ca59079 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -1,5 +1,6 @@ use std::collections::BTreeMap; use std::sync::OnceLock; +use std::time::Instant; use std::{error, fmt}; use stella_aho_corasick_core as aho_core; @@ -11,12 +12,12 @@ pub type Result = std::result::Result; const AUTO_REGEX_CHUNK_MAX_SIZE: usize = 16; const AUTO_REGEX_CHUNK_COMPLEXITY_BUDGET: u32 = 6; const AUTO_REGEX_ISOLATE_COMPLEXITY: u32 = 7; -const SPLIT_IDENTITY_AC_CHUNK_SIZE: usize = 20_000; +const SPLIT_IDENTITY_AC_CHUNK_SIZE: usize = 100_000; const SPLIT_IDENTITY_AC_MIN_PATTERNS: usize = SPLIT_IDENTITY_AC_CHUNK_SIZE; const MATCH_FIELDS: usize = 3; const FUZZY_MATCH_FIELDS: usize = 4; const PREPARED_ARTIFACTS_MAGIC: &[u8; 8] = b"TXSRCH01"; -const PREPARED_ARTIFACTS_VERSION: u32 = 6; +const PREPARED_ARTIFACTS_VERSION: u32 = 7; const PREPARED_AHO_ARTIFACT_MIN_BYTES: usize = std::mem::size_of::() + std::mem::size_of::() + std::mem::size_of::() @@ -30,6 +31,13 @@ const PREPARED_LITERAL_UNICODE_BOUNDARIES: u8 = 1 << 2; const PREPARED_LITERAL_FLAGS_MASK: u8 = PREPARED_LITERAL_CASE_INSENSITIVE | PREPARED_LITERAL_WHOLE_WORDS | PREPARED_LITERAL_UNICODE_BOUNDARIES; +const PARALLEL_LAZY_REGEX_WARM_MIN_SLOTS: usize = 4; +const PARALLEL_FIND_MIN_ENGINES: usize = 4; +const PARALLEL_FIND_MIN_BYTES: usize = 32 * 1024; +const PARALLEL_FIND_MAX_WORKERS: usize = 4; +const PARALLEL_SPLIT_LITERAL_MIN_ENGINES: usize = 2; +const INLINE_LITERAL_PREFILTER_MAX_PATTERNS: usize = 4; +const INLINE_LITERAL_PREFILTER_MAX_BYTES: usize = 128; #[derive(Clone, Debug, Eq, PartialEq)] pub enum Error { @@ -224,6 +232,9 @@ pub struct RegexPattern { /// regex engine is skipped unless this pattern also matches. Lazy patterns /// only, matching the TS layer. pub prefilter_regex: Option, + /// Optional byte radius around literal prefilter hits for lazy single-pattern + /// scans. Keeps broad cue words from forcing a full-haystack regex pass. + pub prefilter_window_bytes: Option, } impl RegexPattern { @@ -236,6 +247,7 @@ impl RegexPattern { prefilter_any: Vec::new(), prefilter_case_insensitive: None, prefilter_regex: None, + prefilter_window_bytes: None, } } } @@ -368,6 +380,53 @@ pub struct EngineStats { pub fuzzy_slots: usize, } +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum EngineKind { + Literal, + SplitLiteral, + Regex, + Fuzzy, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct BuildStats { + pub slot: usize, + pub engine: EngineKind, + pub pattern_count: usize, + pub first_pattern: Option, + pub last_pattern: Option, + pub elapsed_us: u64, + pub aho_artifact_count: usize, + pub aho_artifact_bytes: usize, + pub regex_artifact_count: usize, + pub regex_artifact_bytes: usize, + pub regex_lazy: bool, + pub regex_prefilter: bool, + pub regex_prefilter_regex: bool, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct FindStats { + pub slot: usize, + pub subslot: Option, + pub engine: EngineKind, + pub pattern_count: usize, + pub first_pattern: Option, + pub last_pattern: Option, + pub match_count: usize, + pub elapsed_us: u64, +} + +pub struct TextSearchBuildResult { + pub search: TextSearch, + pub stats: Vec, +} + +pub struct TextSearchFindResult { + pub matches: Vec, + pub stats: Vec, +} + #[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct PreparedTextSearchArtifacts { pub aho_automata: Vec, @@ -509,6 +568,7 @@ pub struct RegexOptions { pub prefilter_any: Vec, pub prefilter_case_insensitive: Option, pub prefilter_regex: Option, + pub prefilter_window_bytes: Option, } pub struct TextSearch { @@ -546,6 +606,8 @@ struct RegexSlot { engine: RegexEngine, prefilter: Option, prefilter_regex: Option>, + prefilter_window_bytes: Option, + prefilter_window_needs_full_context: bool, index_map: Vec, name_map: Vec>, } @@ -567,7 +629,13 @@ struct FuzzySlot { } enum LiteralPrefilter { - Single { needle: String }, + Single { + needle: String, + }, + Inline { + needles: Vec, + case_insensitive: bool, + }, Many(Box), } @@ -589,7 +657,33 @@ enum RegexBuildMode<'a> { }, } +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +struct ArtifactMetrics { + count: usize, + bytes: usize, +} + impl RegexBuildMode<'_> { + const fn position(&self) -> usize { + match self { + Self::Build => 0, + Self::Capture(artifacts) => artifacts.len(), + Self::Load { index, .. } => *index, + } + } + + fn artifact_metrics_since(&self, start: usize) -> ArtifactMetrics { + match self { + Self::Build => ArtifactMetrics::default(), + Self::Capture(artifacts) => artifacts + .get(start..) + .map_or_else(ArtifactMetrics::default, artifact_metrics), + Self::Load { artifacts, index } => artifacts + .get(start..*index) + .map_or_else(ArtifactMetrics::default, artifact_metrics), + } + } + fn next_prepared_regex(&mut self) -> Result<&[u8]> { let Self::Load { artifacts, index } = self else { return Err(Error::BuildRegex(String::from( @@ -619,6 +713,26 @@ impl RegexBuildMode<'_> { } impl AhoBuildMode<'_> { + const fn position(&self) -> usize { + match self { + Self::Build => 0, + Self::Capture(automata) => automata.len(), + Self::Load { index, .. } => *index, + } + } + + fn artifact_metrics_since(&self, start: usize) -> ArtifactMetrics { + match self { + Self::Build => ArtifactMetrics::default(), + Self::Capture(automata) => automata + .get(start..) + .map_or_else(ArtifactMetrics::default, artifact_metrics), + Self::Load { automata, index } => automata + .get(start..*index) + .map_or_else(ArtifactMetrics::default, artifact_metrics), + } + } + fn prepared_aho_count(&self) -> Result { let Self::Load { automata, .. } = self else { return Err(Error::BuildLiteral(String::from( @@ -664,6 +778,136 @@ impl AhoBuildMode<'_> { } } +trait PreparedArtifactBytes { + fn byte_len(&self) -> usize; +} + +impl PreparedArtifactBytes for PreparedAhoArtifact { + fn byte_len(&self) -> usize { + self.bytes.len() + } +} + +impl PreparedArtifactBytes for PreparedRegexArtifact { + fn byte_len(&self) -> usize { + self.bytes.len() + } +} + +fn artifact_metrics( + artifacts: &[impl PreparedArtifactBytes], +) -> ArtifactMetrics { + ArtifactMetrics { + count: artifacts.len(), + bytes: artifacts + .iter() + .map(PreparedArtifactBytes::byte_len) + .fold(0usize, usize::saturating_add), + } +} + +fn push_timed_engine( + engines: &mut Vec, + stats: Option<&mut Vec>, + pattern_count: usize, + pattern_bounds: PatternBounds, + aho_mode: &mut AhoBuildMode<'_>, + regex_mode: &mut RegexBuildMode<'_>, + build: impl FnOnce( + &mut AhoBuildMode<'_>, + &mut RegexBuildMode<'_>, + ) -> Result, +) -> Result<()> { + let slot = engines.len(); + let aho_start = aho_mode.position(); + let regex_start = regex_mode.position(); + let start = stats.as_ref().map(|_| Instant::now()); + let engine = build(aho_mode, regex_mode)?; + if let (Some(stats), Some(start)) = (stats, start) { + stats.push(build_stats_for_engine( + slot, + &engine, + pattern_count, + pattern_bounds, + elapsed_us(start), + aho_mode.artifact_metrics_since(aho_start), + regex_mode.artifact_metrics_since(regex_start), + )); + } + engines.push(engine); + Ok(()) +} + +const fn build_stats_for_engine( + slot: usize, + engine: &EngineSlot, + pattern_count: usize, + pattern_bounds: PatternBounds, + elapsed_us: u64, + aho_metrics: ArtifactMetrics, + regex_metrics: ArtifactMetrics, +) -> BuildStats { + BuildStats { + slot, + engine: engine_kind(engine), + pattern_count, + first_pattern: pattern_bounds.first, + last_pattern: pattern_bounds.last, + elapsed_us, + aho_artifact_count: aho_metrics.count, + aho_artifact_bytes: aho_metrics.bytes, + regex_artifact_count: regex_metrics.count, + regex_artifact_bytes: regex_metrics.bytes, + regex_lazy: engine_regex_lazy(engine), + regex_prefilter: engine_regex_prefilter(engine), + regex_prefilter_regex: engine_regex_prefilter_regex(engine), + } +} + +const fn engine_kind(engine: &EngineSlot) -> EngineKind { + match engine { + EngineSlot::Literal(_) => EngineKind::Literal, + EngineSlot::SplitLiteral(_) => EngineKind::SplitLiteral, + EngineSlot::Regex(_) => EngineKind::Regex, + EngineSlot::Fuzzy(_) => EngineKind::Fuzzy, + } +} + +const fn engine_regex_lazy(engine: &EngineSlot) -> bool { + matches!( + engine, + EngineSlot::Regex(RegexSlot { + engine: RegexEngine::Lazy { .. }, + .. + }) + ) +} + +const fn engine_regex_prefilter(engine: &EngineSlot) -> bool { + matches!( + engine, + EngineSlot::Regex(RegexSlot { + prefilter: Some(_), + .. + }) + ) +} + +const fn engine_regex_prefilter_regex(engine: &EngineSlot) -> bool { + matches!( + engine, + EngineSlot::Regex(RegexSlot { + prefilter_regex: Some(_), + .. + }) + ) +} + +fn elapsed_us(start: Instant) -> u64 { + let micros = start.elapsed().as_micros(); + u64::try_from(micros).unwrap_or(u64::MAX) +} + impl TextSearch { pub fn new( patterns: impl IntoIterator, @@ -674,6 +918,20 @@ impl TextSearch { Self::build_with_modes(patterns, options, &mut aho_mode, &mut regex_mode) } + pub fn new_with_build_stats( + patterns: impl IntoIterator, + options: TextSearchOptions, + ) -> Result { + let mut aho_mode = AhoBuildMode::Build; + let mut regex_mode = RegexBuildMode::Build; + Self::build_with_modes_stats( + patterns, + options, + &mut aho_mode, + &mut regex_mode, + ) + } + pub fn prepare_artifacts( patterns: impl IntoIterator, options: TextSearchOptions, @@ -718,6 +976,30 @@ impl TextSearch { Ok(search) } + pub fn with_prepared_artifacts_build_stats( + patterns: impl IntoIterator, + options: TextSearchOptions, + artifacts: &PreparedTextSearchArtifacts, + ) -> Result { + let mut aho_mode = AhoBuildMode::Load { + automata: &artifacts.aho_automata, + index: 0, + }; + let mut regex_mode = RegexBuildMode::Load { + artifacts: &artifacts.regex_sets, + index: 0, + }; + let result = Self::build_with_modes_stats( + patterns, + options, + &mut aho_mode, + &mut regex_mode, + )?; + aho_mode.finish()?; + regex_mode.finish()?; + Ok(result) + } + pub fn with_prepared_all_literal_artifacts( options: TextSearchOptions, artifacts: &PreparedTextSearchArtifacts, @@ -737,6 +1019,43 @@ impl TextSearch { Ok(search) } + pub fn with_prepared_all_literal_artifacts_build_stats( + options: TextSearchOptions, + artifacts: &PreparedTextSearchArtifacts, + ) -> Result { + let mut aho_mode = AhoBuildMode::Load { + automata: &artifacts.aho_automata, + index: 0, + }; + let regex_mode = RegexBuildMode::Load { + artifacts: &artifacts.regex_sets, + index: 0, + }; + let start = Instant::now(); + let search = + Self::build_all_literal_from_aho_artifacts(options, &mut aho_mode)?; + let stats = search + .engines + .first() + .map(|engine| { + let artifact_metrics = aho_mode.artifact_metrics_since(0); + build_stats_for_engine( + 0, + engine, + search.pattern_count, + identity_pattern_bounds(search.pattern_count), + elapsed_us(start), + artifact_metrics, + ArtifactMetrics::default(), + ) + }) + .into_iter() + .collect(); + aho_mode.finish()?; + regex_mode.finish()?; + Ok(TextSearchBuildResult { search, stats }) + } + fn build_all_literal_from_aho_artifacts( options: TextSearchOptions, aho_mode: &mut AhoBuildMode<'_>, @@ -793,100 +1112,110 @@ impl TextSearch { aho_mode: &mut AhoBuildMode<'_>, regex_mode: &mut RegexBuildMode<'_>, ) -> Result { + Ok( + Self::build_with_modes_inner( + patterns, options, aho_mode, regex_mode, None, + )? + .search, + ) + } + + fn build_with_modes_stats( + patterns: impl IntoIterator, + options: TextSearchOptions, + aho_mode: &mut AhoBuildMode<'_>, + regex_mode: &mut RegexBuildMode<'_>, + ) -> Result { + let mut stats = Vec::new(); + Self::build_with_modes_inner( + patterns, + options, + aho_mode, + regex_mode, + Some(&mut stats), + ) + } + + fn build_with_modes_inner( + patterns: impl IntoIterator, + options: TextSearchOptions, + aho_mode: &mut AhoBuildMode<'_>, + regex_mode: &mut RegexBuildMode<'_>, + mut stats: Option<&mut Vec>, + ) -> Result { let patterns = patterns.into_iter().collect::>(); - let pattern_count = patterns.len(); + let total_pattern_count = patterns.len(); let mut engines = Vec::new(); if options.all_literal && all_auto_patterns(&patterns) && !patterns.is_empty() { - engines.push(build_identity_literal_engine(patterns, options, aho_mode)?); - return Ok(Self { - engines, - pattern_count, - overlap_strategy: options.overlap_strategy, - }); - } - - let classified = classify_pattern_entries(patterns, options.all_literal)?; - let mut fuzzy = Vec::new(); - let mut literals = Vec::new(); - let mut shared_regex = Vec::new(); - let mut isolated_regex = Vec::new(); - - for pattern in classified { - if pattern.fuzzy_distance.is_some() { - fuzzy.push(pattern); - } else if pattern.is_literal { - literals.push(pattern); - } else if pattern - .regex_options - .as_ref() - .is_some_and(|regex_options| regex_options.lazy) - || pattern.alternation_count > options.max_alternations - { - isolated_regex.push(pattern); - } else { - shared_regex.push(pattern); - } - } - - if !fuzzy.is_empty() { - engines.push(EngineSlot::Fuzzy(build_fuzzy_engine(fuzzy, options)?)); - } - - for (literal_options, group) in group_literals(literals, options) { - engines.push(EngineSlot::Literal(build_literal_engine( - group, - literal_options, - options.overlap_strategy, - aho_mode, - )?)); - } - - if options.overlap_strategy == OverlapStrategy::All { - for pattern in shared_regex { - let regex_options = - Some(pattern.regex_options.clone().unwrap_or_default()); - engines.push(EngineSlot::Regex(build_regex_engine( - vec![pattern], - options, - regex_options, - aho_mode, - regex_mode, - )?)); - } - } else { - for chunk in - chunk_shared_regex_patterns(shared_regex, options.regex_chunk_size) - { - engines.push(EngineSlot::Regex(build_regex_engine( - chunk, options, None, aho_mode, regex_mode, - )?)); - } - } - - for pattern in isolated_regex { - // Mirror the TS layer: isolated regexes carry their own prefilter - // options. Passing `Some(..)` (even when the pattern has no explicit - // options) marks this as the isolated path, which suppresses the - // shared-path leading-literal prefilter inference. - let lazy_options = - Some(pattern.regex_options.clone().unwrap_or_default()); - engines.push(EngineSlot::Regex(build_regex_engine( - vec![pattern], - options, - lazy_options, + push_timed_engine( + &mut engines, + stats.as_deref_mut(), + total_pattern_count, + identity_pattern_bounds(total_pattern_count), aho_mode, regex_mode, - )?)); + |aho_mode, _| { + build_identity_literal_engine(patterns, options, aho_mode) + }, + )?; + return Ok(TextSearchBuildResult { + search: Self { + engines, + pattern_count: total_pattern_count, + overlap_strategy: options.overlap_strategy, + }, + stats: stats.map_or_else(Vec::new, std::mem::take), + }); } - Ok(Self { - engines, - pattern_count, - overlap_strategy: options.overlap_strategy, + let parts = partition_classified_patterns( + classify_pattern_entries(patterns, options.all_literal)?, + options, + ); + push_fuzzy_engines( + &mut engines, + &mut stats, + parts.fuzzy, + options, + aho_mode, + regex_mode, + )?; + push_literal_engines( + &mut engines, + &mut stats, + parts.literals, + options, + aho_mode, + regex_mode, + )?; + push_shared_regex_engines( + &mut engines, + &mut stats, + parts.shared_regex, + options, + aho_mode, + regex_mode, + )?; + push_isolated_regex_engines( + &mut engines, + &mut stats, + parts.isolated_regex, + options, + aho_mode, + regex_mode, + )?; + + Ok(TextSearchBuildResult { + search: Self { + engines, + pattern_count: total_pattern_count, + overlap_strategy: options.overlap_strategy, + }, + stats: stats.map_or_else(Vec::new, std::mem::take), }) } @@ -927,10 +1256,36 @@ impl TextSearch { } pub fn warm_lazy_regex(&self) -> Result<()> { - for engine in &self.engines { - warm_engine_lazy_regex(engine)?; - } - Ok(()) + let lazy_engines = self + .engines + .iter() + .filter(|engine| engine_has_uninitialized_lazy_regex(engine)) + .collect::>(); + let lazy_count = lazy_engines.len(); + if lazy_count < PARALLEL_LAZY_REGEX_WARM_MIN_SLOTS { + return warm_engine_refs_lazy_regex(&lazy_engines); + } + + let workers = std::thread::available_parallelism() + .map_or(1, usize::from) + .min(lazy_count); + if workers <= 1 { + return warm_engine_refs_lazy_regex(&lazy_engines); + } + + let chunk_size = lazy_count.div_ceil(workers); + std::thread::scope(|scope| { + let mut handles = Vec::with_capacity(workers); + for chunk in lazy_engines.chunks(chunk_size) { + handles.push(scope.spawn(move || warm_engine_refs_lazy_regex(chunk))); + } + for handle in handles { + handle.join().map_err(|_| { + Error::BuildRegex(String::from("Lazy regex warm-up panicked")) + })??; + } + Ok(()) + }) } pub fn is_match(&self, haystack: &str) -> Result { @@ -943,20 +1298,18 @@ impl TextSearch { } pub fn find_iter(&self, haystack: &str) -> Result> { - let mut matches = Vec::new(); - for engine in &self.engines { - matches.extend(engine_find_iter(engine, haystack)?); - } - - if matches.len() <= 1 { - return Ok(matches); - } - if self.overlap_strategy == OverlapStrategy::All { - matches.sort_by_key(|found| found.start); - return Ok(matches); - } + let mut matches = find_iter_engines(&self.engines, haystack)?; + finalize_find_matches(&mut matches, self.overlap_strategy); + Ok(matches) + } - Ok(merge_and_select(matches)) + pub fn find_iter_with_stats( + &self, + haystack: &str, + ) -> Result { + let mut result = find_iter_engines_with_stats(&self.engines, haystack)?; + finalize_find_matches(&mut result.matches, self.overlap_strategy); + Ok(result) } /// Like [`find_iter`](Self::find_iter) but reports UTF-16 code-unit offsets. @@ -1039,39 +1392,277 @@ impl TextSearch { } } -fn warm_engine_lazy_regex(engine: &EngineSlot) -> Result<()> { - if let EngineSlot::Regex(slot) = engine { - _ = regex_slot_engine(slot)?; +struct ClassifiedParts { + fuzzy: Vec, + literals: Vec, + shared_regex: Vec, + isolated_regex: Vec, +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +struct PatternBounds { + first: Option, + last: Option, +} + +fn pattern_bounds(patterns: &[ClassifiedPattern]) -> PatternBounds { + PatternBounds { + first: patterns.first().map(|pattern| pattern.original_index), + last: patterns.last().map(|pattern| pattern.original_index), } - Ok(()) } -pub fn classify_patterns( - entries: &[PatternEntry], - all_literal: bool, -) -> Result> { - classify_pattern_entries(entries.to_vec(), all_literal) +fn identity_pattern_bounds(pattern_count: usize) -> PatternBounds { + let last = pattern_count + .checked_sub(1) + .and_then(|index| u32::try_from(index).ok()); + PatternBounds { + first: last.map(|_| 0), + last, + } } -fn classify_pattern_entries( - entries: Vec, - all_literal: bool, -) -> Result> { - let mut result = Vec::with_capacity(entries.len()); - for (index, entry) in entries.into_iter().enumerate() { - let original_index = pattern_index(index)?; - result.push(match entry { - PatternEntry::Auto(pattern) => { - let alternation_count = if all_literal { - 0 - } else { - count_alternations(&pattern) - }; - let is_literal = all_literal || is_literal_pattern(&pattern); - let regex_complexity = - score_regex_complexity(&pattern, alternation_count); - ClassifiedPattern { - original_index, +fn partition_classified_patterns( + classified: Vec, + options: TextSearchOptions, +) -> ClassifiedParts { + let mut fuzzy = Vec::new(); + let mut literals = Vec::new(); + let mut shared_regex = Vec::new(); + let mut isolated_regex = Vec::new(); + + for pattern in classified { + if pattern.fuzzy_distance.is_some() { + fuzzy.push(pattern); + } else if pattern.is_literal { + literals.push(pattern); + } else if pattern + .regex_options + .as_ref() + .is_some_and(|regex_options| regex_options.lazy) + || pattern.alternation_count > options.max_alternations + { + isolated_regex.push(pattern); + } else { + shared_regex.push(pattern); + } + } + + ClassifiedParts { + fuzzy, + literals, + shared_regex, + isolated_regex, + } +} + +fn push_fuzzy_engines( + engines: &mut Vec, + stats: &mut Option<&mut Vec>, + fuzzy: Vec, + options: TextSearchOptions, + aho_mode: &mut AhoBuildMode<'_>, + regex_mode: &mut RegexBuildMode<'_>, +) -> Result<()> { + if fuzzy.is_empty() { + return Ok(()); + } + let fuzzy_pattern_count = fuzzy.len(); + push_timed_engine( + engines, + stats.as_deref_mut(), + fuzzy_pattern_count, + pattern_bounds(&fuzzy), + aho_mode, + regex_mode, + |_, _| Ok(EngineSlot::Fuzzy(build_fuzzy_engine(fuzzy, options)?)), + ) +} + +fn push_literal_engines( + engines: &mut Vec, + stats: &mut Option<&mut Vec>, + literals: Vec, + options: TextSearchOptions, + aho_mode: &mut AhoBuildMode<'_>, + regex_mode: &mut RegexBuildMode<'_>, +) -> Result<()> { + for (literal_options, group) in group_literals(literals, options) { + let literal_pattern_count = group.len(); + push_timed_engine( + engines, + stats.as_deref_mut(), + literal_pattern_count, + pattern_bounds(&group), + aho_mode, + regex_mode, + |aho_mode, _| { + Ok(EngineSlot::Literal(build_literal_engine( + group, + literal_options, + options.overlap_strategy, + aho_mode, + )?)) + }, + )?; + } + Ok(()) +} + +fn push_shared_regex_engines( + engines: &mut Vec, + stats: &mut Option<&mut Vec>, + shared_regex: Vec, + options: TextSearchOptions, + aho_mode: &mut AhoBuildMode<'_>, + regex_mode: &mut RegexBuildMode<'_>, +) -> Result<()> { + if options.overlap_strategy == OverlapStrategy::All { + return push_overlap_all_regex_engines( + engines, + stats, + shared_regex, + options, + aho_mode, + regex_mode, + ); + } + + for chunk in + chunk_shared_regex_patterns(shared_regex, options.regex_chunk_size) + { + let regex_pattern_count = chunk.len(); + push_timed_engine( + engines, + stats.as_deref_mut(), + regex_pattern_count, + pattern_bounds(&chunk), + aho_mode, + regex_mode, + |aho_mode, regex_mode| { + Ok(EngineSlot::Regex(build_regex_engine( + chunk, options, None, aho_mode, regex_mode, + )?)) + }, + )?; + } + Ok(()) +} + +fn push_overlap_all_regex_engines( + engines: &mut Vec, + stats: &mut Option<&mut Vec>, + patterns: Vec, + options: TextSearchOptions, + aho_mode: &mut AhoBuildMode<'_>, + regex_mode: &mut RegexBuildMode<'_>, +) -> Result<()> { + for pattern in patterns { + let regex_options = Some(pattern.regex_options.clone().unwrap_or_default()); + push_timed_engine( + engines, + stats.as_deref_mut(), + 1, + pattern_bounds(std::slice::from_ref(&pattern)), + aho_mode, + regex_mode, + |aho_mode, regex_mode| { + Ok(EngineSlot::Regex(build_regex_engine( + vec![pattern], + options, + regex_options, + aho_mode, + regex_mode, + )?)) + }, + )?; + } + Ok(()) +} + +fn push_isolated_regex_engines( + engines: &mut Vec, + stats: &mut Option<&mut Vec>, + isolated_regex: Vec, + options: TextSearchOptions, + aho_mode: &mut AhoBuildMode<'_>, + regex_mode: &mut RegexBuildMode<'_>, +) -> Result<()> { + for pattern in isolated_regex { + // Mirrors the TS lazyOptions path: `Some(..)` marks isolated regexes and + // suppresses shared leading-literal inference. + let lazy_options = Some(pattern.regex_options.clone().unwrap_or_default()); + push_timed_engine( + engines, + stats.as_deref_mut(), + 1, + pattern_bounds(std::slice::from_ref(&pattern)), + aho_mode, + regex_mode, + |aho_mode, regex_mode| { + Ok(EngineSlot::Regex(build_regex_engine( + vec![pattern], + options, + lazy_options, + aho_mode, + regex_mode, + )?)) + }, + )?; + } + Ok(()) +} + +fn warm_engine_lazy_regex(engine: &EngineSlot) -> Result<()> { + if let EngineSlot::Regex(slot) = engine { + _ = regex_slot_engine(slot)?; + } + Ok(()) +} + +fn warm_engine_refs_lazy_regex(engines: &[&EngineSlot]) -> Result<()> { + for engine in engines { + warm_engine_lazy_regex(engine)?; + } + Ok(()) +} + +fn engine_has_uninitialized_lazy_regex(engine: &EngineSlot) -> bool { + matches!( + engine, + EngineSlot::Regex(RegexSlot { + engine: RegexEngine::Lazy { cell, .. }, + .. + }) if cell.get().is_none() + ) +} + +pub fn classify_patterns( + entries: &[PatternEntry], + all_literal: bool, +) -> Result> { + classify_pattern_entries(entries.to_vec(), all_literal) +} + +fn classify_pattern_entries( + entries: Vec, + all_literal: bool, +) -> Result> { + let mut result = Vec::with_capacity(entries.len()); + for (index, entry) in entries.into_iter().enumerate() { + let original_index = pattern_index(index)?; + result.push(match entry { + PatternEntry::Auto(pattern) => { + let alternation_count = if all_literal { + 0 + } else { + count_alternations(&pattern) + }; + let is_literal = all_literal || is_literal_pattern(&pattern); + let regex_complexity = + score_regex_complexity(&pattern, alternation_count); + ClassifiedPattern { + original_index, pattern, name: None, alternation_count, @@ -1090,6 +1681,7 @@ fn classify_pattern_entries( prefilter_any, prefilter_case_insensitive, prefilter_regex, + prefilter_window_bytes, } = regex_pattern; let alternation_count = count_alternations(&source); let regex_complexity = @@ -1107,6 +1699,7 @@ fn classify_pattern_entries( prefilter_any, prefilter_case_insensitive, prefilter_regex, + prefilter_window_bytes, }), regex_complexity, } @@ -1530,6 +2123,7 @@ fn build_regex_engine( &[prefilter.literal], prefilter.case_insensitive || options.case_insensitive, aho_mode, + false, ) }) .transpose()? @@ -1550,9 +2144,19 @@ fn build_regex_engine( whole_words: options.whole_words, unicode_boundaries: options.unicode_boundaries, }; + let prefilter_window_bytes = + lazy_options.as_ref().and_then(|regex_options| { + regex_options + .lazy + .then_some(regex_options.prefilter_window_bytes) + .flatten() + }); + let prefilter_window_needs_full_context = prefilter_window_bytes.is_some() + && values.iter().any(|pattern| has_lookaround(pattern)); let (engine, prefilter, prefilter_regex) = match lazy_options { Some(lazy_options) if lazy_options.lazy => { + let windowed = lazy_options.prefilter_window_bytes.is_some(); let prefilter = if lazy_options.prefilter_any.is_empty() { None } else { @@ -1562,6 +2166,7 @@ fn build_regex_engine( .prefilter_case_insensitive .unwrap_or(options.case_insensitive), aho_mode, + windowed, )?) }; let prefilter_regex = lazy_options @@ -1593,6 +2198,8 @@ fn build_regex_engine( engine, prefilter, prefilter_regex, + prefilter_window_bytes, + prefilter_window_needs_full_context, index_map, name_map, }) @@ -1861,6 +2468,7 @@ fn build_literal_prefilter( literals: &[String], case_insensitive: bool, aho_mode: &mut AhoBuildMode<'_>, + force_engine: bool, ) -> Result { let mut unique = Vec::::new(); for literal in literals { @@ -1874,11 +2482,18 @@ fn build_literal_prefilter( // `s` yet is already lowercase). Route case-insensitive prefilters through // Aho-Corasick, which folds identically to the search engines; reserve the // single-literal fast path for the case-sensitive exact-substring check. - if unique.len() == 1 && !case_insensitive { + if unique.len() == 1 && !case_insensitive && !force_engine { let needle = unique.pop().unwrap_or_default(); return Ok(LiteralPrefilter::Single { needle }); } + if should_inline_literal_prefilter(&unique) && !force_engine { + return Ok(LiteralPrefilter::Inline { + needles: unique, + case_insensitive, + }); + } + build_aho( unique, LiteralOptions { @@ -1893,6 +2508,92 @@ fn build_literal_prefilter( .map(LiteralPrefilter::Many) } +fn should_inline_literal_prefilter(literals: &[String]) -> bool { + if literals.is_empty() + || literals.len() > INLINE_LITERAL_PREFILTER_MAX_PATTERNS + { + return false; + } + let byte_len = literals + .iter() + .map(String::len) + .try_fold(0usize, usize::checked_add); + byte_len.is_some_and(|len| len <= INLINE_LITERAL_PREFILTER_MAX_BYTES) +} + +fn inline_literal_prefilter_matches( + haystack: &str, + needle: &str, + case_insensitive: bool, +) -> bool { + if haystack.contains(needle) { + return true; + } + if !case_insensitive { + return false; + } + if needle.is_ascii() { + if contains_ignore_ascii_case(haystack.as_bytes(), needle.as_bytes()) { + return true; + } + if haystack.is_ascii() { + return false; + } + } + contains_unicode_case_insensitive(haystack, needle) +} + +fn contains_ignore_ascii_case(haystack: &[u8], needle: &[u8]) -> bool { + if needle.is_empty() { + return true; + } + haystack + .windows(needle.len()) + .any(|candidate| candidate.eq_ignore_ascii_case(needle)) +} + +fn contains_unicode_case_insensitive(haystack: &str, needle: &str) -> bool { + if needle.is_empty() { + return true; + } + + let needle_lower = needle + .chars() + .flat_map(char::to_lowercase) + .collect::>(); + if contains_case_folded_chars(haystack, &needle_lower, char::to_lowercase) { + return true; + } + + let needle_upper = needle + .chars() + .flat_map(char::to_uppercase) + .collect::>(); + if needle_upper == needle_lower { + return false; + } + contains_case_folded_chars(haystack, &needle_upper, char::to_uppercase) +} + +fn contains_case_folded_chars( + haystack: &str, + needle: &[char], + fold: impl Fn(char) -> I + Copy, +) -> bool +where + I: Iterator, +{ + haystack.char_indices().any(|(start, _)| { + let Some(rest) = haystack.get(start..) else { + return false; + }; + let mut folded = rest.chars().flat_map(fold); + needle + .iter() + .all(|needle_char| folded.next() == Some(*needle_char)) + }) +} + /// Builds a secondary regex prefilter gate. /// /// Mirrors the TS `prefilterRegex.test(haystack)` check: a bare match test with @@ -2196,12 +2897,7 @@ fn engine_is_match(engine: &EngineSlot, haystack: &str) -> Result { } Ok(false) } - EngineSlot::Regex(slot) => { - if !regex_prefilter_matches(slot, haystack)? { - return Ok(false); - } - Ok(regex_slot_engine(slot)?.is_match(haystack)) - } + EngineSlot::Regex(slot) => regex_slot_is_match(slot, haystack), EngineSlot::Fuzzy(slot) => slot .engine .is_match(haystack) @@ -2231,15 +2927,11 @@ fn engine_find_iter(engine: &EngineSlot, haystack: &str) -> Result> { } EngineSlot::SplitLiteral(slot) => split_literal_find_iter(slot, haystack), EngineSlot::Regex(slot) => { - if !regex_prefilter_matches(slot, haystack)? { - return Ok(Vec::new()); - } + let packed = regex_slot_find_iter_packed_bytes(slot, haystack)?; extend_triple_matches( SearchEngine::Regex, haystack, - ®ex_slot_engine(slot)? - .find_iter_packed_bytes(haystack) - .map_err(|error| Error::BuildRegex(error.to_string()))?, + &packed, &Remap::Mapped { index_map: &slot.index_map, name_map: &slot.name_map, @@ -2259,25 +2951,164 @@ fn engine_find_iter(engine: &EngineSlot, haystack: &str) -> Result> { } } -fn split_literal_find_iter( - slot: &SplitLiteralSlot, +fn engine_find_iter_with_stats( + engine: &EngineSlot, + haystack: &str, + slot: usize, +) -> Result { + if let EngineSlot::SplitLiteral(split) = engine { + return split_literal_find_iter_with_stats(split, haystack, slot); + } + + let start = Instant::now(); + let matches = engine_find_iter(engine, haystack)?; + let stats = vec![find_stats_for_engine( + slot, + None, + engine_kind(engine), + engine_pattern_count(engine), + engine_pattern_bounds(engine), + matches.len(), + start, + )]; + Ok(TextSearchFindResult { matches, stats }) +} + +fn find_iter_engines( + engines: &[EngineSlot], + haystack: &str, +) -> Result> { + if should_parallel_find(engines, haystack) { + return find_iter_engines_parallel(engines, haystack); + } + + find_iter_engines_sequential(engines, haystack) +} + +fn find_iter_engines_with_stats( + engines: &[EngineSlot], + haystack: &str, +) -> Result { + if should_parallel_find(engines, haystack) { + return find_iter_engines_parallel_with_stats(engines, haystack); + } + + find_iter_engines_sequential_with_stats(engines, haystack, 0) +} + +fn find_iter_engines_sequential( + engines: &[EngineSlot], haystack: &str, ) -> Result> { let mut matches = Vec::new(); - for engine in &slot.engines { - let packed = engine - .engine - .find_overlapping_iter_packed_bytes(haystack) - .map_err(|error| Error::BuildLiteral(error.to_string()))?; - matches.extend(extend_triple_matches( - SearchEngine::Literal, + for engine in engines { + matches.extend(engine_find_iter(engine, haystack)?); + } + Ok(matches) +} + +fn find_iter_engines_sequential_with_stats( + engines: &[EngineSlot], + haystack: &str, + slot_offset: usize, +) -> Result { + let mut matches = Vec::new(); + let mut stats = Vec::new(); + for (index, engine) in engines.iter().enumerate() { + let result = engine_find_iter_with_stats( + engine, haystack, - &packed, - &Remap::Offset { - pattern_offset: engine.pattern_offset, - }, - )?); + slot_offset.saturating_add(index), + )?; + matches.extend(result.matches); + stats.extend(result.stats); + } + Ok(TextSearchFindResult { matches, stats }) +} + +fn find_iter_engines_parallel( + engines: &[EngineSlot], + haystack: &str, +) -> Result> { + let workers = parallel_find_workers(engines.len()); + if workers <= 1 { + return find_iter_engines_sequential(engines, haystack); } + + let chunk_size = engines.len().div_ceil(workers); + std::thread::scope(|scope| { + let mut handles = Vec::new(); + for chunk in engines.chunks(chunk_size) { + handles.push( + scope.spawn(move || find_iter_engines_sequential(chunk, haystack)), + ); + } + + let mut matches = Vec::new(); + for handle in handles { + let chunk_matches = handle.join().map_err(|_| { + Error::BuildRegex(String::from("Parallel search panicked")) + })??; + matches.extend(chunk_matches); + } + Ok(matches) + }) +} + +fn find_iter_engines_parallel_with_stats( + engines: &[EngineSlot], + haystack: &str, +) -> Result { + let workers = parallel_find_workers(engines.len()); + if workers <= 1 { + return find_iter_engines_sequential_with_stats(engines, haystack, 0); + } + + let chunk_size = engines.len().div_ceil(workers); + std::thread::scope(|scope| { + let mut handles = Vec::new(); + for (chunk_index, chunk) in engines.chunks(chunk_size).enumerate() { + let slot_offset = chunk_index.saturating_mul(chunk_size); + handles.push(scope.spawn(move || { + find_iter_engines_sequential_with_stats(chunk, haystack, slot_offset) + })); + } + + let mut matches = Vec::new(); + let mut stats = Vec::new(); + for handle in handles { + let result = handle.join().map_err(|_| { + Error::BuildRegex(String::from("Parallel search panicked")) + })??; + matches.extend(result.matches); + stats.extend(result.stats); + } + Ok(TextSearchFindResult { matches, stats }) + }) +} + +const fn should_parallel_find(engines: &[EngineSlot], haystack: &str) -> bool { + haystack.len() >= PARALLEL_FIND_MIN_BYTES + && engines.len() >= PARALLEL_FIND_MIN_ENGINES +} + +fn parallel_find_workers(engine_count: usize) -> usize { + std::thread::available_parallelism() + .map_or(1, std::num::NonZeroUsize::get) + .min(PARALLEL_FIND_MAX_WORKERS) + .min(engine_count) +} + +fn split_literal_find_iter( + slot: &SplitLiteralSlot, + haystack: &str, +) -> Result> { + let matches = if should_parallel_split_literal_find(&slot.engines, haystack) { + split_literal_find_iter_parallel(&slot.engines, haystack)? + } else { + split_literal_find_iter_sequential(&slot.engines, haystack)? + }; + if slot.overlap_strategy == OverlapStrategy::All { return Ok(matches); } @@ -2285,6 +3116,491 @@ fn split_literal_find_iter( Ok(select_leftmost_longest_matches(matches)) } +fn split_literal_find_iter_with_stats( + slot: &SplitLiteralSlot, + haystack: &str, + slot_index: usize, +) -> Result { + let mut result = + if should_parallel_split_literal_find(&slot.engines, haystack) { + split_literal_find_iter_parallel_with_stats( + &slot.engines, + haystack, + slot_index, + )? + } else { + split_literal_find_iter_sequential_with_stats( + &slot.engines, + haystack, + slot_index, + 0, + )? + }; + + if slot.overlap_strategy != OverlapStrategy::All { + result.matches = select_leftmost_longest_matches(result.matches); + } + Ok(result) +} + +fn split_literal_find_iter_sequential( + engines: &[SplitLiteralEngine], + haystack: &str, +) -> Result> { + let mut matches = Vec::new(); + for engine in engines { + matches.extend(split_literal_engine_find_iter(engine, haystack)?); + } + Ok(matches) +} + +fn split_literal_find_iter_sequential_with_stats( + engines: &[SplitLiteralEngine], + haystack: &str, + slot_index: usize, + subslot_offset: usize, +) -> Result { + let mut matches = Vec::new(); + let mut stats = Vec::new(); + for (index, engine) in engines.iter().enumerate() { + let subslot = subslot_offset.saturating_add(index); + let start = Instant::now(); + let engine_matches = split_literal_engine_find_iter(engine, haystack)?; + stats.push(find_stats_for_engine( + slot_index, + Some(subslot), + EngineKind::SplitLiteral, + split_literal_engine_pattern_count(engine), + split_literal_engine_pattern_bounds(engine), + engine_matches.len(), + start, + )); + matches.extend(engine_matches); + } + Ok(TextSearchFindResult { matches, stats }) +} + +fn split_literal_find_iter_parallel( + engines: &[SplitLiteralEngine], + haystack: &str, +) -> Result> { + let workers = parallel_find_workers(engines.len()); + if workers <= 1 { + return split_literal_find_iter_sequential(engines, haystack); + } + + let chunk_size = engines.len().div_ceil(workers); + std::thread::scope(|scope| { + let mut handles = Vec::with_capacity(workers); + for chunk in engines.chunks(chunk_size) { + handles.push( + scope + .spawn(move || split_literal_find_iter_sequential(chunk, haystack)), + ); + } + + let mut matches = Vec::new(); + for handle in handles { + let chunk_matches = handle.join().map_err(|_| { + Error::BuildLiteral(String::from( + "Parallel split literal search panicked", + )) + })??; + matches.extend(chunk_matches); + } + Ok(matches) + }) +} + +fn split_literal_find_iter_parallel_with_stats( + engines: &[SplitLiteralEngine], + haystack: &str, + slot_index: usize, +) -> Result { + let workers = parallel_find_workers(engines.len()); + if workers <= 1 { + return split_literal_find_iter_sequential_with_stats( + engines, haystack, slot_index, 0, + ); + } + + let chunk_size = engines.len().div_ceil(workers); + std::thread::scope(|scope| { + let mut handles = Vec::with_capacity(workers); + for (chunk_index, chunk) in engines.chunks(chunk_size).enumerate() { + let subslot_offset = chunk_index.saturating_mul(chunk_size); + handles.push(scope.spawn(move || { + split_literal_find_iter_sequential_with_stats( + chunk, + haystack, + slot_index, + subslot_offset, + ) + })); + } + + let mut matches = Vec::new(); + let mut stats = Vec::new(); + for handle in handles { + let result = handle.join().map_err(|_| { + Error::BuildLiteral(String::from( + "Parallel split literal search panicked", + )) + })??; + matches.extend(result.matches); + stats.extend(result.stats); + } + Ok(TextSearchFindResult { matches, stats }) + }) +} + +fn split_literal_engine_find_iter( + engine: &SplitLiteralEngine, + haystack: &str, +) -> Result> { + let packed = engine + .engine + .find_overlapping_iter_packed_bytes(haystack) + .map_err(|error| Error::BuildLiteral(error.to_string()))?; + extend_triple_matches( + SearchEngine::Literal, + haystack, + &packed, + &Remap::Offset { + pattern_offset: engine.pattern_offset, + }, + ) +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct ByteWindow { + start: usize, + end: usize, +} + +fn regex_slot_is_match(slot: &RegexSlot, haystack: &str) -> Result { + if !regex_prefilter_matches(slot, haystack)? { + return Ok(false); + } + if slot.prefilter_window_bytes.is_some() { + return Ok(!regex_slot_find_iter_packed_bytes(slot, haystack)?.is_empty()); + } + Ok(regex_slot_engine(slot)?.is_match(haystack)) +} + +fn regex_slot_find_iter_packed_bytes( + slot: &RegexSlot, + haystack: &str, +) -> Result> { + if !regex_prefilter_matches(slot, haystack)? { + return Ok(Vec::new()); + } + let engine = regex_slot_engine(slot)?; + let Some(radius) = slot.prefilter_window_bytes else { + return engine + .find_iter_packed_bytes(haystack) + .map_err(|error| Error::BuildRegex(error.to_string())); + }; + regex_slot_find_iter_packed_windowed(slot, engine, haystack, radius) +} + +fn regex_slot_find_iter_packed_windowed( + slot: &RegexSlot, + engine: ®ex_core::RegexSet, + haystack: &str, + radius: usize, +) -> Result> { + let Some(prefilter) = &slot.prefilter else { + return engine + .find_iter_packed_bytes(haystack) + .map_err(|error| Error::BuildRegex(error.to_string())); + }; + let hits = literal_prefilter_hit_ranges(prefilter, haystack)?; + if hits.is_empty() { + return Ok(Vec::new()); + } + + let windows = merged_prefilter_windows(haystack, &hits, radius); + if slot.prefilter_window_needs_full_context { + return regex_slot_find_iter_packed_windowed_full_context( + engine, haystack, &windows, + ); + } + + let mut triples = Vec::<[u32; MATCH_FIELDS]>::new(); + let mut needs_full_context = false; + for window in &windows { + let slice = str_span(haystack, window.start, window.end)?; + let local = engine + .find_iter_packed_bytes(slice) + .map_err(|error| Error::BuildRegex(error.to_string()))?; + let chunks = local.chunks_exact(MATCH_FIELDS); + if !chunks.remainder().is_empty() { + return Err(Error::InvalidPackedSearchResult { + engine: SearchEngine::Regex, + len: local.len(), + }); + } + for chunk in chunks { + let [pattern, local_start, local_end] = chunk else { + return Err(Error::InvalidPackedSearchResult { + engine: SearchEngine::Regex, + len: local.len(), + }); + }; + let start = window.start.saturating_add(byte_index(*local_start)); + let end = window.start.saturating_add(byte_index(*local_end)); + if touches_internal_window_edge(*window, haystack.len(), start, end) { + needs_full_context = true; + continue; + } + triples.push([ + *pattern, + match_offset(start, end)?, + match_offset(end, start)?, + ]); + } + } + if needs_full_context { + return regex_slot_find_iter_packed_windowed_full_context( + engine, haystack, &windows, + ); + } + Ok(pack_regex_triples(triples)) +} + +fn regex_slot_find_iter_packed_windowed_full_context( + engine: ®ex_core::RegexSet, + haystack: &str, + windows: &[ByteWindow], +) -> Result> { + let packed = engine + .find_iter_packed_bytes(haystack) + .map_err(|error| Error::BuildRegex(error.to_string()))?; + let chunks = packed.chunks_exact(MATCH_FIELDS); + if !chunks.remainder().is_empty() { + return Err(Error::InvalidPackedSearchResult { + engine: SearchEngine::Regex, + len: packed.len(), + }); + } + let mut triples = Vec::<[u32; MATCH_FIELDS]>::new(); + for chunk in chunks { + let [pattern, start, end] = chunk else { + return Err(Error::InvalidPackedSearchResult { + engine: SearchEngine::Regex, + len: packed.len(), + }); + }; + let start = byte_index(*start); + let end = byte_index(*end); + if !match_is_inside_prefilter_window(start, end, windows) { + continue; + } + triples.push([ + *pattern, + match_offset(start, end)?, + match_offset(end, start)?, + ]); + } + Ok(pack_regex_triples(triples)) +} + +fn pack_regex_triples(mut triples: Vec<[u32; MATCH_FIELDS]>) -> Vec { + triples.sort_unstable(); + triples.dedup(); + let packed_capacity = triples.len().checked_mul(MATCH_FIELDS).unwrap_or(0); + let mut packed = Vec::with_capacity(packed_capacity); + for [pattern, start, end] in triples { + packed.push(pattern); + packed.push(start); + packed.push(end); + } + packed +} + +const fn touches_internal_window_edge( + window: ByteWindow, + haystack_len: usize, + start: usize, + end: usize, +) -> bool { + (start == window.start && window.start > 0) + || (end == window.end && window.end < haystack_len) +} + +fn match_is_inside_prefilter_window( + start: usize, + end: usize, + windows: &[ByteWindow], +) -> bool { + windows + .iter() + .any(|window| window.start <= start && end <= window.end) +} + +fn match_offset(value: usize, peer: usize) -> Result { + u32::try_from(value).map_err(|_| Error::InvalidUtf8Span { + start: value, + end: peer, + }) +} + +fn merged_prefilter_windows( + haystack: &str, + hits: &[(usize, usize)], + radius: usize, +) -> Vec { + let mut windows = Vec::with_capacity(hits.len()); + for (hit_start, hit_end) in hits { + let start = floor_char_boundary(haystack, hit_start.saturating_sub(radius)); + let end = ceil_char_boundary( + haystack, + hit_end.saturating_add(radius).min(haystack.len()), + ); + let Some(last) = windows.last_mut() else { + windows.push(ByteWindow { start, end }); + continue; + }; + if start <= last.end { + last.end = last.end.max(end); + } else { + windows.push(ByteWindow { start, end }); + } + } + windows +} + +fn floor_char_boundary(haystack: &str, mut index: usize) -> usize { + index = index.min(haystack.len()); + while index > 0 && !haystack.is_char_boundary(index) { + index = index.saturating_sub(1); + } + index +} + +fn ceil_char_boundary(haystack: &str, mut index: usize) -> usize { + index = index.min(haystack.len()); + while index < haystack.len() && !haystack.is_char_boundary(index) { + index = index.saturating_add(1); + } + index +} + +const fn should_parallel_split_literal_find( + engines: &[SplitLiteralEngine], + haystack: &str, +) -> bool { + haystack.len() >= PARALLEL_FIND_MIN_BYTES + && engines.len() >= PARALLEL_SPLIT_LITERAL_MIN_ENGINES +} + +fn finalize_find_matches( + matches: &mut Vec, + overlap_strategy: OverlapStrategy, +) { + if matches.len() <= 1 { + return; + } + if overlap_strategy == OverlapStrategy::All { + matches.sort_by_key(|found| found.start); + return; + } + + *matches = merge_and_select(std::mem::take(matches)); +} + +fn find_stats_for_engine( + slot: usize, + subslot: Option, + engine: EngineKind, + pattern_count: usize, + pattern_bounds: PatternBounds, + match_count: usize, + start: Instant, +) -> FindStats { + FindStats { + slot, + subslot, + engine, + pattern_count, + first_pattern: pattern_bounds.first, + last_pattern: pattern_bounds.last, + match_count, + elapsed_us: elapsed_us(start), + } +} + +fn engine_pattern_count(engine: &EngineSlot) -> usize { + match engine { + EngineSlot::Literal(slot) => literal_slot_pattern_count(slot), + EngineSlot::SplitLiteral(slot) => slot + .engines + .iter() + .map(split_literal_engine_pattern_count) + .fold(0usize, usize::saturating_add), + EngineSlot::Regex(slot) => slot.index_map.len(), + EngineSlot::Fuzzy(slot) => slot.index_map.len(), + } +} + +fn engine_pattern_bounds(engine: &EngineSlot) -> PatternBounds { + match engine { + EngineSlot::Literal(slot) => literal_slot_pattern_bounds(slot), + EngineSlot::SplitLiteral(slot) => { + let count = engine_pattern_count(engine); + let first = slot.engines.first().map(|split| split.pattern_offset); + let last = count + .checked_sub(1) + .and_then(|index| u32::try_from(index).ok()) + .and_then(|index| first.and_then(|first| first.checked_add(index))); + PatternBounds { first, last } + } + EngineSlot::Regex(slot) => pattern_bounds_from_index_map(&slot.index_map), + EngineSlot::Fuzzy(slot) => pattern_bounds_from_index_map(&slot.index_map), + } +} + +fn literal_slot_pattern_count(slot: &LiteralSlot) -> usize { + if slot.identity_map { + return aho_pattern_count(&slot.engine); + } + slot.index_map.len() +} + +fn literal_slot_pattern_bounds(slot: &LiteralSlot) -> PatternBounds { + if slot.identity_map { + return identity_pattern_bounds(literal_slot_pattern_count(slot)); + } + pattern_bounds_from_index_map(&slot.index_map) +} + +fn split_literal_engine_pattern_count(engine: &SplitLiteralEngine) -> usize { + aho_pattern_count(&engine.engine) +} + +fn split_literal_engine_pattern_bounds( + engine: &SplitLiteralEngine, +) -> PatternBounds { + let first = Some(engine.pattern_offset); + let last = split_literal_engine_pattern_count(engine) + .checked_sub(1) + .and_then(|index| u32::try_from(index).ok()) + .and_then(|index| engine.pattern_offset.checked_add(index)); + PatternBounds { first, last } +} + +const fn pattern_bounds_from_index_map(index_map: &[u32]) -> PatternBounds { + PatternBounds { + first: index_map.first().copied(), + last: index_map.last().copied(), + } +} + +fn aho_pattern_count(engine: &aho_core::AhoCorasick) -> usize { + usize::try_from(engine.pattern_count()).map_or(usize::MAX, |count| count) +} + #[derive(Clone, Copy)] enum Remap<'a> { Mapped { @@ -2413,16 +3729,10 @@ fn remap_pattern( } fn regex_prefilter_matches(slot: &RegexSlot, haystack: &str) -> Result { - if let Some(prefilter) = &slot.prefilter { - let literal_matches = match prefilter { - LiteralPrefilter::Single { needle } => haystack.contains(needle), - LiteralPrefilter::Many(engine) => engine - .is_match(haystack) - .map_err(|error| Error::BuildLiteral(error.to_string()))?, - }; - if !literal_matches { - return Ok(false); - } + if let Some(prefilter) = &slot.prefilter + && !literal_prefilter_matches(prefilter, haystack)? + { + return Ok(false); } if let Some(prefilter_regex) = &slot.prefilter_regex && !prefilter_regex.is_match(haystack) @@ -2432,6 +3742,86 @@ fn regex_prefilter_matches(slot: &RegexSlot, haystack: &str) -> Result { Ok(true) } +fn literal_prefilter_matches( + prefilter: &LiteralPrefilter, + haystack: &str, +) -> Result { + match prefilter { + LiteralPrefilter::Single { needle } => Ok(haystack.contains(needle)), + LiteralPrefilter::Inline { + needles, + case_insensitive, + } => Ok(needles.iter().any(|needle| { + inline_literal_prefilter_matches(haystack, needle, *case_insensitive) + })), + LiteralPrefilter::Many(engine) => engine + .is_match(haystack) + .map_err(|error| Error::BuildLiteral(error.to_string())), + } +} + +fn literal_prefilter_hit_ranges( + prefilter: &LiteralPrefilter, + haystack: &str, +) -> Result> { + match prefilter { + LiteralPrefilter::Single { needle } => { + Ok(overlapping_literal_hit_ranges(haystack, needle)) + } + LiteralPrefilter::Inline { .. } => { + if literal_prefilter_matches(prefilter, haystack)? { + Ok(vec![(0, haystack.len())]) + } else { + Ok(Vec::new()) + } + } + LiteralPrefilter::Many(engine) => { + let packed = engine + .find_overlapping_iter_packed_bytes(haystack) + .map_err(|error| Error::BuildLiteral(error.to_string()))?; + let chunks = packed.chunks_exact(MATCH_FIELDS); + if !chunks.remainder().is_empty() { + return Err(Error::InvalidPackedSearchResult { + engine: SearchEngine::Literal, + len: packed.len(), + }); + } + let mut ranges = Vec::with_capacity(chunks.len()); + for chunk in chunks { + let [_pattern, start, end] = chunk else { + return Err(Error::InvalidPackedSearchResult { + engine: SearchEngine::Literal, + len: packed.len(), + }); + }; + ranges.push((byte_index(*start), byte_index(*end))); + } + Ok(ranges) + } + } +} + +fn overlapping_literal_hit_ranges( + haystack: &str, + needle: &str, +) -> Vec<(usize, usize)> { + if needle.is_empty() { + return vec![(0, haystack.len())]; + } + + let mut ranges = Vec::new(); + let mut offset = 0; + while let Some(rest) = haystack.get(offset..) { + let Some(relative_start) = rest.find(needle) else { + break; + }; + let start = offset.saturating_add(relative_start); + ranges.push((start, start.saturating_add(needle.len()))); + offset = ceil_char_boundary(haystack, start.saturating_add(1)); + } + ranges +} + fn merge_and_select(mut matches: Vec) -> Vec { if matches.len() <= 1 { return matches; diff --git a/crates/core/tests/text_search.rs b/crates/core/tests/text_search.rs index 11e22e1..f23e215 100644 --- a/crates/core/tests/text_search.rs +++ b/crates/core/tests/text_search.rs @@ -4,12 +4,17 @@ clippy::unwrap_used )] +use proptest::test_runner::Config as ProptestConfig; use stella_text_search_core::{ - Error, FuzzyDistance, FuzzyPattern, LiteralPattern, OverlapStrategy, - PatternEntry, PreparedTextSearchArtifacts, RegexPattern, TextSearch, - TextSearchOptions, classify_patterns, count_alternations, + EngineKind, Error, FuzzyDistance, FuzzyPattern, LiteralPattern, + OverlapStrategy, PatternEntry, PreparedTextSearchArtifacts, RegexPattern, + TextSearch, TextSearchOptions, classify_patterns, count_alternations, }; +const SPLIT_LITERAL_FIXTURE_CHUNK_SIZE: usize = 100_000; +const SPLIT_LITERAL_FIXTURE_SIZE: usize = SPLIT_LITERAL_FIXTURE_CHUNK_SIZE + 1; +const SPLIT_LITERAL_FIXTURE_CHUNK_PATTERN: u32 = 100_000; + #[test] fn routes_literal_regex_and_fuzzy_patterns() { let search = TextSearch::new( @@ -151,6 +156,71 @@ fn prepared_artifacts_match_direct_search() { ); } +#[test] +fn prepared_build_stats_report_internal_regex_slots() { + let patterns = vec![ + PatternEntry::Regex(RegexPattern::new(r"\b[A-Z]{2}\d{4}\b")), + PatternEntry::Regex(RegexPattern::new(r"\b[A-Z]{3}\d{3}\b")), + ]; + let artifacts = TextSearch::prepare_artifacts( + patterns.clone(), + TextSearchOptions::default(), + ) + .unwrap(); + + let result = TextSearch::with_prepared_artifacts_build_stats( + patterns, + TextSearchOptions::default(), + &artifacts, + ) + .unwrap(); + let regex_stats = result + .stats + .iter() + .filter(|stat| stat.engine == EngineKind::Regex) + .collect::>(); + + assert!(!regex_stats.is_empty()); + assert_eq!( + regex_stats + .iter() + .map(|stat| stat.pattern_count) + .sum::(), + 2 + ); + assert!(regex_stats.iter().any(|stat| { + stat.regex_artifact_count > 0 && stat.regex_artifact_bytes > 0 + })); +} + +#[test] +fn long_multi_slot_regex_search_preserves_match_order() { + let mut patterns = Vec::new(); + for index in 0..6 { + let mut pattern = RegexPattern::new(format!(r"TOKEN{index}-\d+")); + pattern.lazy = true; + patterns.push(PatternEntry::Regex(pattern)); + } + let search = TextSearch::new(patterns, TextSearchOptions::default()).unwrap(); + assert_eq!(search.engine_stats().regex_slots, 6); + + let haystack = format!( + "{} TOKEN3-42 {} TOKEN0-9 {} TOKEN5-77", + "x".repeat(33_000), + "y".repeat(128), + "z".repeat(128), + ); + + let matches = search.find_iter(&haystack).unwrap(); + assert_eq!( + matches + .iter() + .map(|found| (found.pattern, found.text.as_str())) + .collect::>(), + vec![(3, "TOKEN3-42"), (0, "TOKEN0-9"), (5, "TOKEN5-77")] + ); +} + #[test] fn prepared_artifacts_capture_regex_sets() { let mut account = RegexPattern::new(r"ACC-\d{3}"); @@ -234,6 +304,31 @@ fn prepared_artifacts_capture_lazy_regex_sets() { assert!(!prepared.is_match("token").unwrap()); } +#[test] +fn small_lazy_prefilters_stay_inline_in_prepared_artifacts() { + let mut regex = RegexPattern::new( + r"(?i)\boddíl[^\S\n]+[A-Z][^\S\n]*,[^\S\n]*vložka[^\S\n]+\d{1,6}\b", + ); + regex.lazy = true; + regex.prefilter_any.push(String::from("oddíl")); + regex.prefilter_any.push(String::from("vložka")); + regex.prefilter_case_insensitive = Some(true); + let patterns = vec![PatternEntry::Regex(regex)]; + let options = TextSearchOptions::default(); + + let artifacts = + TextSearch::prepare_artifacts(patterns.clone(), options).unwrap(); + + assert_eq!(artifacts.aho_automata.len(), 0); + assert_eq!(artifacts.regex_sets.len(), 1); + + let prepared = + TextSearch::with_prepared_artifacts(patterns, options, &artifacts).unwrap(); + + assert!(prepared.is_match("ODDÍL C, VLOŽKA 334648").unwrap()); + assert!(!prepared.is_match("plain contract text").unwrap()); +} + #[test] fn prepared_artifacts_preserve_ascii_boundaries() { let patterns = vec![PatternEntry::from("idea")]; @@ -426,11 +521,12 @@ fn prepared_regex_artifacts_roundtrip_bytes() { #[test] fn prepared_all_literal_artifacts_load_without_patterns() { - let mut patterns = (0..20_001) + let mut patterns = (0..SPLIT_LITERAL_FIXTURE_SIZE) .map(|index| PatternEntry::from(format!("term-{index}"))) .collect::>(); *patterns.get_mut(0).unwrap() = PatternEntry::from("alpha"); - *patterns.get_mut(20_000).unwrap() = PatternEntry::from("alpha beta"); + *patterns.get_mut(SPLIT_LITERAL_FIXTURE_CHUNK_SIZE).unwrap() = + PatternEntry::from("alpha beta"); let options = TextSearchOptions { all_literal: true, whole_words: true, @@ -455,7 +551,7 @@ fn prepared_all_literal_artifacts_load_without_patterns() { #[test] fn prepared_all_literal_artifacts_preserve_exact_split_threshold() { - let mut patterns = (0..20_000) + let mut patterns = (0..SPLIT_LITERAL_FIXTURE_CHUNK_SIZE) .map(|index| PatternEntry::from(format!("term-{index}"))) .collect::>(); *patterns.get_mut(0).unwrap() = PatternEntry::from("alpha"); @@ -682,6 +778,29 @@ fn warm_lazy_regex_initializes_without_prefilter_hit() { assert!(search.warm_lazy_regex().is_err()); } +#[test] +fn warm_lazy_regex_initializes_prepared_lazy_slots() { + let mut patterns = Vec::new(); + for index in 0..8 { + let mut regex = RegexPattern::new(format!("token{index}\\s+\\d+")); + regex.lazy = true; + patterns.push(PatternEntry::Regex(regex)); + } + let options = TextSearchOptions::default(); + let artifacts = + TextSearch::prepare_artifacts(patterns.clone(), options).unwrap(); + let search = + TextSearch::with_prepared_artifacts(patterns, options, &artifacts).unwrap(); + + assert_eq!(search.engine_stats().regex_slots, 8); + search.warm_lazy_regex().unwrap(); + + let matches = search.find_iter("before token6 42 after").unwrap(); + + assert_eq!(matches.len(), 1); + assert_eq!(matches.first().map(|match_| match_.pattern), Some(6)); +} + #[test] fn non_lazy_prefilter_does_not_gate_sibling_patterns_in_shared_slot() { // A non-lazy regex carrying `prefilter_any` shares a chunked slot with other @@ -715,11 +834,12 @@ fn non_lazy_prefilter_does_not_gate_sibling_patterns_in_shared_slot() { #[test] fn all_literal_identity_sets_split_and_select_globally() { - let mut patterns = (0..20_001) + let mut patterns = (0..SPLIT_LITERAL_FIXTURE_SIZE) .map(|index| PatternEntry::from(format!("term-{index}"))) .collect::>(); *patterns.get_mut(0).unwrap() = PatternEntry::from("alpha"); - *patterns.get_mut(20_000).unwrap() = PatternEntry::from("alpha beta"); + *patterns.get_mut(SPLIT_LITERAL_FIXTURE_CHUNK_SIZE).unwrap() = + PatternEntry::from("alpha beta"); let search = TextSearch::new( patterns, @@ -742,7 +862,129 @@ fn all_literal_identity_sets_split_and_select_globally() { .iter() .map(|found| (found.pattern, found.text.as_str())) .collect::>(), - vec![(0, "ALPHA"), (20_000, "ALPHA beta")] + vec![ + (0, "ALPHA"), + (SPLIT_LITERAL_FIXTURE_CHUNK_PATTERN, "ALPHA beta") + ] + ); +} + +#[test] +fn long_split_literal_search_preserves_global_overlap_order() { + let mut patterns = (0..SPLIT_LITERAL_FIXTURE_SIZE) + .map(|index| PatternEntry::from(format!("term-{index}"))) + .collect::>(); + *patterns.get_mut(0).unwrap() = PatternEntry::from("alpha"); + *patterns.get_mut(SPLIT_LITERAL_FIXTURE_CHUNK_SIZE).unwrap() = + PatternEntry::from("alpha beta"); + + let search = TextSearch::new( + patterns, + TextSearchOptions { + all_literal: true, + whole_words: true, + case_insensitive: true, + overlap_strategy: OverlapStrategy::All, + ..TextSearchOptions::default() + }, + ) + .unwrap(); + assert_eq!(search.engine_stats().split_literal_engines, 2); + + let haystack = format!("{} ALPHA beta", "x ".repeat(17_000)); + let matches = search.find_iter(&haystack).unwrap(); + assert_eq!( + matches + .iter() + .map(|found| (found.pattern, found.text.as_str())) + .collect::>(), + vec![ + (0, "ALPHA"), + (SPLIT_LITERAL_FIXTURE_CHUNK_PATTERN, "ALPHA beta") + ] + ); +} + +#[test] +fn find_stats_report_regex_slots_and_split_literal_subslots() { + let regex_search = TextSearch::new( + (0..6).map(|index| { + let mut pattern = RegexPattern::new(format!(r"TOKEN{index}-\d+")); + pattern.lazy = true; + PatternEntry::Regex(pattern) + }), + TextSearchOptions::default(), + ) + .unwrap(); + let regex_result = regex_search + .find_iter_with_stats(&format!("{} TOKEN3-42", "x".repeat(33_000))) + .unwrap(); + + assert_eq!( + regex_result + .matches + .iter() + .map(|found| (found.pattern, found.text.as_str())) + .collect::>(), + vec![(3, "TOKEN3-42")] + ); + assert_eq!( + regex_result + .stats + .iter() + .filter(|stat| stat.engine == EngineKind::Regex) + .map(|stat| stat.pattern_count) + .sum::(), + 6 + ); + + let mut literal_patterns = (0..SPLIT_LITERAL_FIXTURE_SIZE) + .map(|index| PatternEntry::from(format!("term-{index}"))) + .collect::>(); + *literal_patterns.get_mut(0).unwrap() = PatternEntry::from("alpha"); + *literal_patterns + .get_mut(SPLIT_LITERAL_FIXTURE_CHUNK_SIZE) + .unwrap() = PatternEntry::from("alpha beta"); + let literal_search = TextSearch::new( + literal_patterns, + TextSearchOptions { + all_literal: true, + whole_words: true, + case_insensitive: true, + overlap_strategy: OverlapStrategy::All, + ..TextSearchOptions::default() + }, + ) + .unwrap(); + let literal_result = literal_search + .find_iter_with_stats(&format!("{} ALPHA beta", "x ".repeat(17_000))) + .unwrap(); + + assert_eq!( + literal_result + .matches + .iter() + .map(|found| (found.pattern, found.text.as_str())) + .collect::>(), + vec![ + (0, "ALPHA"), + (SPLIT_LITERAL_FIXTURE_CHUNK_PATTERN, "ALPHA beta") + ] + ); + assert_eq!( + literal_result + .stats + .iter() + .filter(|stat| stat.engine == EngineKind::SplitLiteral) + .map(|stat| stat.pattern_count) + .sum::(), + SPLIT_LITERAL_FIXTURE_SIZE + ); + assert!( + literal_result + .stats + .iter() + .any(|stat| stat.subslot == Some(1)) ); } @@ -915,6 +1157,235 @@ fn lazy_regex_prefilter_regex_gates_engine_build() { assert!(search.is_match("year 123").is_err()); } +#[test] +fn inline_prefilter_matches_ascii_needles_in_unicode_haystacks() { + let mut regex = RegexPattern::new("("); + regex.lazy = true; + regex.prefilter_any.push(String::from("token")); + regex.prefilter_case_insensitive = Some(true); + let search = TextSearch::new( + vec![PatternEntry::Regex(regex)], + TextSearchOptions::default(), + ) + .unwrap(); + + assert!(search.is_match("případ TOKEN").is_err()); +} + +#[test] +fn inline_prefilter_matches_unicode_case_folds() { + let mut regex = RegexPattern::new("("); + regex.lazy = true; + regex.prefilter_any.push(String::from("straße")); + regex.prefilter_case_insensitive = Some(true); + let search = TextSearch::new( + vec![PatternEntry::Regex(regex)], + TextSearchOptions::default(), + ) + .unwrap(); + + assert!(search.is_match("STRASSE").is_err()); +} + +#[test] +fn lazy_regex_prefilter_window_matches_unbounded_results() { + let haystack = format!( + "{} alpha token 42 {} beta token 77 {}", + "noise ".repeat(4_000), + "noise ".repeat(2_000), + "noise ".repeat(1_000) + ); + + let mut bounded = + RegexPattern::new(r"(?>(), + vec!["2026-06-29"] + ); +} + +#[test] +fn lazy_regex_prefilter_window_keeps_exact_internal_edge_matches() { + let mut regex = RegexPattern::new("foo"); + regex.lazy = true; + regex.prefilter_any.push(String::from("foo")); + regex.prefilter_window_bytes = Some(0); + + let search = TextSearch::new( + vec![PatternEntry::Regex(regex)], + TextSearchOptions::default(), + ) + .unwrap(); + let matches = search.find_iter("xfoo").unwrap(); + + assert_eq!( + matches + .iter() + .map(|found| found.text.as_str()) + .collect::>(), + vec!["foo"] + ); +} + +#[test] +fn lazy_regex_prefilter_window_verifies_internal_edge_matches() { + let mut regex = RegexPattern::new(r"(?