From 0af02e6f9633958f75d17de3d262735d9e56f87d Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Mon, 29 Jun 2026 17:01:00 +0200 Subject: [PATCH 1/3] feat: allow omitting regex artifacts --- crates/core/src/lib.rs | 96 +++++++++++++++++++-- crates/core/tests/text_search.rs | 144 ++++++++++++++++++++++++++++++- 2 files changed, 230 insertions(+), 10 deletions(-) diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index ca59079..6dc394c 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -235,6 +235,7 @@ pub struct RegexPattern { /// Optional byte radius around literal prefilter hits for lazy single-pattern /// scans. Keeps broad cue words from forcing a full-haystack regex pass. pub prefilter_window_bytes: Option, + pub prepared_artifact_policy: PreparedArtifactPolicy, } impl RegexPattern { @@ -248,6 +249,7 @@ impl RegexPattern { prefilter_case_insensitive: None, prefilter_regex: None, prefilter_window_bytes: None, + prepared_artifact_policy: PreparedArtifactPolicy::Inherit, } } } @@ -310,6 +312,33 @@ pub enum OverlapStrategy { All, } +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub enum RegexArtifactPolicy { + #[default] + Include, + Omit, +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub enum PreparedArtifactPolicy { + #[default] + Inherit, + Include, + Omit, +} + +impl PreparedArtifactPolicy { + const fn should_capture(self, default_policy: RegexArtifactPolicy) -> bool { + match self { + Self::Inherit => { + matches!(default_policy, RegexArtifactPolicy::Include) + } + Self::Include => true, + Self::Omit => false, + } + } +} + #[allow(clippy::struct_excessive_bools)] #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct TextSearchOptions { @@ -317,6 +346,7 @@ pub struct TextSearchOptions { pub whole_words: bool, pub max_alternations: u32, pub regex_chunk_size: Option, + pub regex_artifact_policy: RegexArtifactPolicy, pub fuzzy_metric: FuzzyMetric, pub normalize_diacritics: bool, pub case_insensitive: bool, @@ -331,6 +361,7 @@ impl Default for TextSearchOptions { whole_words: false, max_alternations: 50, regex_chunk_size: None, + regex_artifact_policy: RegexArtifactPolicy::Include, fuzzy_metric: FuzzyMetric::Levenshtein, normalize_diacritics: false, case_insensitive: false, @@ -569,6 +600,7 @@ pub struct RegexOptions { pub prefilter_case_insensitive: Option, pub prefilter_regex: Option, pub prefilter_window_bytes: Option, + pub prepared_artifact_policy: PreparedArtifactPolicy, } pub struct TextSearch { @@ -1436,11 +1468,11 @@ fn partition_classified_patterns( fuzzy.push(pattern); } else if pattern.is_literal { literals.push(pattern); - } else if pattern - .regex_options - .as_ref() - .is_some_and(|regex_options| regex_options.lazy) - || pattern.alternation_count > options.max_alternations + } else if pattern.regex_options.as_ref().is_some_and(|regex_options| { + regex_options.lazy + || regex_options.prepared_artifact_policy + != PreparedArtifactPolicy::Inherit + }) || pattern.alternation_count > options.max_alternations { isolated_regex.push(pattern); } else { @@ -1682,6 +1714,7 @@ fn classify_pattern_entries( prefilter_case_insensitive, prefilter_regex, prefilter_window_bytes, + prepared_artifact_policy, } = regex_pattern; let alternation_count = count_alternations(&source); let regex_complexity = @@ -1700,6 +1733,7 @@ fn classify_pattern_entries( prefilter_case_insensitive, prefilter_regex, prefilter_window_bytes, + prepared_artifact_policy, }), regex_complexity, } @@ -2174,8 +2208,15 @@ fn build_regex_engine( .map(|source| build_prefilter_regex(source, regex_mode)) .transpose()? .map(Box::new); - let prepared = - capture_or_load_lazy_regex(&values, engine_options, regex_mode)?; + let capture_prepared = lazy_options + .prepared_artifact_policy + .should_capture(options.regex_artifact_policy); + let prepared = capture_or_load_lazy_regex( + &values, + engine_options, + regex_mode, + capture_prepared, + )?; let engine = RegexEngine::Lazy { patterns: values, options: engine_options, @@ -2185,10 +2226,15 @@ fn build_regex_engine( (engine, prefilter, prefilter_regex) } _ => { + let capture_prepared = should_capture_eager_regex_artifact( + lazy_options.as_ref(), + options.regex_artifact_policy, + ); let engine = RegexEngine::Eager(Box::new(build_regex_set( values, engine_options, regex_mode, + capture_prepared, )?)); (engine, inferred_prefilter, None) } @@ -2205,6 +2251,20 @@ fn build_regex_engine( }) } +fn should_capture_eager_regex_artifact( + regex_options: Option<&RegexOptions>, + default_policy: RegexArtifactPolicy, +) -> bool { + regex_options.map_or( + matches!(default_policy, RegexArtifactPolicy::Include), + |options| { + options + .prepared_artifact_policy + .should_capture(default_policy) + }, + ) +} + fn regex_slot_engine(slot: &RegexSlot) -> Result<®ex_core::RegexSet> { match &slot.engine { RegexEngine::Eager(engine) => Ok(engine.as_ref()), @@ -2243,10 +2303,16 @@ fn build_regex_set( patterns: Vec, options: regex_core::Options, regex_mode: &mut RegexBuildMode<'_>, + capture_prepared: bool, ) -> Result { match regex_mode { RegexBuildMode::Build => regex_core::RegexSet::new(patterns, options), RegexBuildMode::Capture(artifacts) => { + if !capture_prepared { + artifacts.push(PreparedRegexArtifact { bytes: Vec::new() }); + return regex_core::RegexSet::new(patterns, options) + .map_err(|error| Error::BuildRegex(error.to_string())); + } let bytes = regex_core::RegexSet::prepare(patterns.clone(), options) .map_err(|error| Error::BuildRegex(error.to_string()))?; let set = regex_core::RegexSet::with_prepared(patterns, options, &bytes); @@ -2255,6 +2321,10 @@ fn build_regex_set( } RegexBuildMode::Load { .. } => { let bytes = regex_mode.next_prepared_regex()?; + if bytes.is_empty() { + return regex_core::RegexSet::new(patterns, options) + .map_err(|error| Error::BuildRegex(error.to_string())); + } regex_core::RegexSet::with_prepared(patterns, options, bytes) } } @@ -2265,10 +2335,15 @@ fn capture_or_load_lazy_regex( patterns: &[String], options: regex_core::Options, regex_mode: &mut RegexBuildMode<'_>, + capture_prepared: bool, ) -> Result>> { match regex_mode { RegexBuildMode::Build => Ok(None), RegexBuildMode::Capture(artifacts) => { + if !capture_prepared { + artifacts.push(PreparedRegexArtifact { bytes: Vec::new() }); + return Ok(None); + } let bytes = regex_core::RegexSet::prepare(patterns.to_vec(), options) .map_err(|error| Error::BuildRegex(error.to_string()))?; artifacts.push(PreparedRegexArtifact { @@ -2277,7 +2352,11 @@ fn capture_or_load_lazy_regex( Ok(Some(bytes)) } RegexBuildMode::Load { .. } => { - Ok(Some(regex_mode.next_prepared_regex()?.to_vec())) + let bytes = regex_mode.next_prepared_regex()?; + if bytes.is_empty() { + return Ok(None); + } + Ok(Some(bytes.to_vec())) } } } @@ -2609,6 +2688,7 @@ fn build_prefilter_regex( unicode_boundaries: true, }, regex_mode, + true, ) } diff --git a/crates/core/tests/text_search.rs b/crates/core/tests/text_search.rs index f23e215..f987987 100644 --- a/crates/core/tests/text_search.rs +++ b/crates/core/tests/text_search.rs @@ -7,8 +7,9 @@ use proptest::test_runner::Config as ProptestConfig; use stella_text_search_core::{ EngineKind, Error, FuzzyDistance, FuzzyPattern, LiteralPattern, - OverlapStrategy, PatternEntry, PreparedTextSearchArtifacts, RegexPattern, - TextSearch, TextSearchOptions, classify_patterns, count_alternations, + OverlapStrategy, PatternEntry, PreparedArtifactPolicy, + PreparedTextSearchArtifacts, RegexArtifactPolicy, RegexPattern, TextSearch, + TextSearchOptions, classify_patterns, count_alternations, }; const SPLIT_LITERAL_FIXTURE_CHUNK_SIZE: usize = 100_000; @@ -519,6 +520,145 @@ fn prepared_regex_artifacts_roundtrip_bytes() { ); } +#[test] +fn prepared_lazy_regex_artifacts_can_be_omitted() { + let mut regex = RegexPattern::new(r"\bTicket-\d{4}\b"); + regex.lazy = true; + regex.prefilter_any = vec![String::from("Ticket-")]; + regex.prepared_artifact_policy = PreparedArtifactPolicy::Omit; + let patterns = vec![PatternEntry::Regex(regex)]; + let options = TextSearchOptions::default(); + + let artifacts = + TextSearch::prepare_artifacts(patterns.clone(), options).unwrap(); + assert!(!artifacts.regex_sets.is_empty()); + assert!( + artifacts + .regex_sets + .iter() + .all(|artifact| artifact.bytes.is_empty()) + ); + + let direct = TextSearch::new(patterns.clone(), options).unwrap(); + let prepared = + TextSearch::with_prepared_artifacts(patterns, options, &artifacts).unwrap(); + + assert_eq!( + prepared.find_iter("Ticket-1234").unwrap(), + direct.find_iter("Ticket-1234").unwrap() + ); + assert!(prepared.find_iter("Invoice-1234").unwrap().is_empty()); +} + +#[test] +fn prepared_lazy_regex_artifacts_can_be_omitted_by_default() { + let mut regex = RegexPattern::new(r"\bCase-\d{4}\b"); + regex.lazy = true; + regex.prefilter_any = vec![String::from("Case-")]; + let patterns = vec![PatternEntry::Regex(regex)]; + let options = TextSearchOptions { + regex_artifact_policy: RegexArtifactPolicy::Omit, + ..TextSearchOptions::default() + }; + + let artifacts = + TextSearch::prepare_artifacts(patterns.clone(), options).unwrap(); + assert!(!artifacts.regex_sets.is_empty()); + assert!( + artifacts + .regex_sets + .iter() + .all(|artifact| artifact.bytes.is_empty()) + ); + + let prepared = + TextSearch::with_prepared_artifacts(patterns, options, &artifacts).unwrap(); + assert_eq!(prepared.which_match("Case-1234").unwrap(), vec![0]); +} + +#[test] +fn prepared_lazy_regex_artifacts_can_override_global_omit() { + let mut regex = RegexPattern::new(r"\bClaim-\d{4}\b"); + regex.lazy = true; + regex.prefilter_any = vec![String::from("Claim-")]; + regex.prepared_artifact_policy = PreparedArtifactPolicy::Include; + let patterns = vec![PatternEntry::Regex(regex)]; + let options = TextSearchOptions { + regex_artifact_policy: RegexArtifactPolicy::Omit, + ..TextSearchOptions::default() + }; + + let artifacts = + TextSearch::prepare_artifacts(patterns.clone(), options).unwrap(); + assert_eq!(artifacts.regex_sets.len(), 1); + assert!( + artifacts + .regex_sets + .first() + .is_some_and(|artifact| !artifact.bytes.is_empty()) + ); + + let prepared = + TextSearch::with_prepared_artifacts(patterns, options, &artifacts).unwrap(); + assert_eq!(prepared.which_match("Claim-1234").unwrap(), vec![0]); +} + +#[test] +fn prepared_eager_regex_artifacts_can_be_omitted() { + let mut regex = RegexPattern::new(r"\bOrder-\d{4}\b"); + regex.prepared_artifact_policy = PreparedArtifactPolicy::Omit; + let patterns = vec![PatternEntry::Regex(regex)]; + let options = TextSearchOptions::default(); + + let artifacts = + TextSearch::prepare_artifacts(patterns.clone(), options).unwrap(); + assert!(!artifacts.regex_sets.is_empty()); + assert!( + artifacts + .regex_sets + .iter() + .all(|artifact| artifact.bytes.is_empty()) + ); + + let direct = TextSearch::new(patterns.clone(), options).unwrap(); + let prepared = + TextSearch::with_prepared_artifacts(patterns, options, &artifacts).unwrap(); + assert_eq!( + prepared.find_iter("Order-1234").unwrap(), + direct.find_iter("Order-1234").unwrap() + ); +} + +#[test] +fn prepared_eager_regex_artifacts_can_be_omitted_by_default() { + let patterns = vec![ + PatternEntry::Regex(RegexPattern::new(r"\bAlpha-\d{4}\b")), + PatternEntry::Regex(RegexPattern::new(r"\bBeta-\d{4}\b")), + ]; + let options = TextSearchOptions { + regex_artifact_policy: RegexArtifactPolicy::Omit, + ..TextSearchOptions::default() + }; + + let artifacts = + TextSearch::prepare_artifacts(patterns.clone(), options).unwrap(); + assert!(!artifacts.regex_sets.is_empty()); + assert!( + artifacts + .regex_sets + .iter() + .all(|artifact| artifact.bytes.is_empty()) + ); + + let direct = TextSearch::new(patterns.clone(), options).unwrap(); + let prepared = + TextSearch::with_prepared_artifacts(patterns, options, &artifacts).unwrap(); + assert_eq!( + prepared.find_iter("Alpha-1234 Beta-1234").unwrap(), + direct.find_iter("Alpha-1234 Beta-1234").unwrap() + ); +} + #[test] fn prepared_all_literal_artifacts_load_without_patterns() { let mut patterns = (0..SPLIT_LITERAL_FIXTURE_SIZE) From 654d9b13af3a3db7a01923935234d8054dfd74ce Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Tue, 30 Jun 2026 02:00:51 +0200 Subject: [PATCH 2/3] perf: borrow prepared artifact bytes --- crates/core/src/lib.rs | 161 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 153 insertions(+), 8 deletions(-) diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 6dc394c..4064848 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -464,6 +464,12 @@ pub struct PreparedTextSearchArtifacts { pub regex_sets: Vec, } +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct PreparedTextSearchArtifactsView<'a> { + pub aho_automata: Vec>, + pub regex_sets: Vec>, +} + #[derive(Clone, Debug, Eq, PartialEq)] pub struct PreparedAhoArtifact { pub fingerprint: u64, @@ -472,11 +478,24 @@ pub struct PreparedAhoArtifact { pub bytes: Vec, } +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct PreparedAhoArtifactView<'a> { + pub fingerprint: u64, + pub options: LiteralOptions, + pub identity: bool, + pub bytes: &'a [u8], +} + #[derive(Clone, Debug, Eq, PartialEq)] pub struct PreparedRegexArtifact { pub bytes: Vec, } +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub struct PreparedRegexArtifactView<'a> { + pub bytes: &'a [u8], +} + impl PreparedTextSearchArtifacts { pub fn to_bytes(&self) -> Result> { let mut bytes = Vec::new(); @@ -511,6 +530,28 @@ impl PreparedTextSearchArtifacts { } pub fn from_bytes(bytes: &[u8]) -> Result { + Ok(PreparedTextSearchArtifactsView::from_bytes(bytes)?.into_owned()) + } + + #[must_use] + pub fn as_view(&self) -> PreparedTextSearchArtifactsView<'_> { + PreparedTextSearchArtifactsView { + aho_automata: self + .aho_automata + .iter() + .map(PreparedAhoArtifactView::from) + .collect(), + regex_sets: self + .regex_sets + .iter() + .map(PreparedRegexArtifactView::from) + .collect(), + } + } +} + +impl<'a> PreparedTextSearchArtifactsView<'a> { + pub fn from_bytes(bytes: &'a [u8]) -> Result { let mut reader = PreparedArtifactReader::new(bytes); let magic = reader.read_bytes(PREPARED_ARTIFACTS_MAGIC.len())?; if magic != PREPARED_ARTIFACTS_MAGIC { @@ -534,8 +575,8 @@ impl PreparedTextSearchArtifacts { let fingerprint = reader.read_u64()?; let options = literal_options_from_flags(reader.read_u8()?)?; let identity = read_identity_flag(reader.read_u8()?)?; - let automaton = reader.read_len_prefixed_bytes()?.to_vec(); - aho_automata.push(PreparedAhoArtifact { + let automaton = reader.read_len_prefixed_bytes()?; + aho_automata.push(PreparedAhoArtifactView { fingerprint, options, identity, @@ -555,8 +596,8 @@ impl PreparedTextSearchArtifacts { } let mut regex_sets = Vec::with_capacity(regex_count); for _ in 0..regex_count { - regex_sets.push(PreparedRegexArtifact { - bytes: reader.read_len_prefixed_bytes()?.to_vec(), + regex_sets.push(PreparedRegexArtifactView { + bytes: reader.read_len_prefixed_bytes()?, }); } reader.finish()?; @@ -565,6 +606,60 @@ impl PreparedTextSearchArtifacts { regex_sets, }) } + + #[must_use] + pub fn into_owned(self) -> PreparedTextSearchArtifacts { + PreparedTextSearchArtifacts { + aho_automata: self + .aho_automata + .into_iter() + .map(PreparedAhoArtifact::from) + .collect(), + regex_sets: self + .regex_sets + .into_iter() + .map(PreparedRegexArtifact::from) + .collect(), + } + } +} + +impl<'a> From<&'a PreparedAhoArtifact> for PreparedAhoArtifactView<'a> { + fn from(artifact: &'a PreparedAhoArtifact) -> Self { + Self { + fingerprint: artifact.fingerprint, + options: artifact.options, + identity: artifact.identity, + bytes: &artifact.bytes, + } + } +} + +impl From> for PreparedAhoArtifact { + fn from(artifact: PreparedAhoArtifactView<'_>) -> Self { + Self { + fingerprint: artifact.fingerprint, + options: artifact.options, + identity: artifact.identity, + bytes: artifact.bytes.to_vec(), + } + } +} + +impl<'a> From<&'a PreparedRegexArtifact> for PreparedRegexArtifactView<'a> { + fn from(artifact: &'a PreparedRegexArtifact) -> Self { + Self { + bytes: &artifact.bytes, + } + } +} + +impl From> for PreparedRegexArtifact { + fn from(artifact: PreparedRegexArtifactView<'_>) -> Self { + Self { + bytes: artifact.bytes.to_vec(), + } + } } #[derive(Clone, Debug, Eq, PartialEq)] @@ -675,7 +770,7 @@ enum AhoBuildMode<'a> { Build, Capture(&'a mut Vec), Load { - automata: &'a [PreparedAhoArtifact], + automata: &'a [PreparedAhoArtifactView<'a>], index: usize, }, } @@ -684,7 +779,7 @@ enum RegexBuildMode<'a> { Build, Capture(&'a mut Vec), Load { - artifacts: &'a [PreparedRegexArtifact], + artifacts: &'a [PreparedRegexArtifactView<'a>], index: usize, }, } @@ -727,7 +822,7 @@ impl RegexBuildMode<'_> { return Err(Error::PreparedRegexArtifactMissing { index: current }); }; *index = current.saturating_add(1); - Ok(&artifact.bytes) + Ok(artifact.bytes) } const fn finish(&self) -> Result<()> { @@ -792,7 +887,7 @@ impl AhoBuildMode<'_> { artifact.options, artifact.identity, artifact.fingerprint, - &artifact.bytes, + artifact.bytes, )) } @@ -820,12 +915,24 @@ impl PreparedArtifactBytes for PreparedAhoArtifact { } } +impl PreparedArtifactBytes for PreparedAhoArtifactView<'_> { + fn byte_len(&self) -> usize { + self.bytes.len() + } +} + impl PreparedArtifactBytes for PreparedRegexArtifact { fn byte_len(&self) -> usize { self.bytes.len() } } +impl PreparedArtifactBytes for PreparedRegexArtifactView<'_> { + fn byte_len(&self) -> usize { + self.bytes.len() + } +} + fn artifact_metrics( artifacts: &[impl PreparedArtifactBytes], ) -> ArtifactMetrics { @@ -988,6 +1095,15 @@ impl TextSearch { patterns: impl IntoIterator, options: TextSearchOptions, artifacts: &PreparedTextSearchArtifacts, + ) -> Result { + let artifacts = artifacts.as_view(); + Self::with_prepared_artifacts_view(patterns, options, &artifacts) + } + + pub fn with_prepared_artifacts_view( + patterns: impl IntoIterator, + options: TextSearchOptions, + artifacts: &PreparedTextSearchArtifactsView<'_>, ) -> Result { let mut aho_mode = AhoBuildMode::Load { automata: &artifacts.aho_automata, @@ -1012,6 +1128,17 @@ impl TextSearch { patterns: impl IntoIterator, options: TextSearchOptions, artifacts: &PreparedTextSearchArtifacts, + ) -> Result { + let artifacts = artifacts.as_view(); + Self::with_prepared_artifacts_view_build_stats( + patterns, options, &artifacts, + ) + } + + pub fn with_prepared_artifacts_view_build_stats( + patterns: impl IntoIterator, + options: TextSearchOptions, + artifacts: &PreparedTextSearchArtifactsView<'_>, ) -> Result { let mut aho_mode = AhoBuildMode::Load { automata: &artifacts.aho_automata, @@ -1035,6 +1162,14 @@ impl TextSearch { pub fn with_prepared_all_literal_artifacts( options: TextSearchOptions, artifacts: &PreparedTextSearchArtifacts, + ) -> Result { + let artifacts = artifacts.as_view(); + Self::with_prepared_all_literal_artifacts_view(options, &artifacts) + } + + pub fn with_prepared_all_literal_artifacts_view( + options: TextSearchOptions, + artifacts: &PreparedTextSearchArtifactsView<'_>, ) -> Result { let mut aho_mode = AhoBuildMode::Load { automata: &artifacts.aho_automata, @@ -1054,6 +1189,16 @@ impl TextSearch { pub fn with_prepared_all_literal_artifacts_build_stats( options: TextSearchOptions, artifacts: &PreparedTextSearchArtifacts, + ) -> Result { + let artifacts = artifacts.as_view(); + Self::with_prepared_all_literal_artifacts_view_build_stats( + options, &artifacts, + ) + } + + pub fn with_prepared_all_literal_artifacts_view_build_stats( + options: TextSearchOptions, + artifacts: &PreparedTextSearchArtifactsView<'_>, ) -> Result { let mut aho_mode = AhoBuildMode::Load { automata: &artifacts.aho_automata, From d4562c43a6f3109f7a4077ba2a8926c89f4f1e8e Mon Sep 17 00:00:00 2001 From: jan-kubica Date: Tue, 30 Jun 2026 09:25:28 +0200 Subject: [PATCH 3/3] fix: respect omitted regex artifacts --- crates/core/src/lib.rs | 36 ++++++++++++-------- crates/core/tests/text_search.rs | 56 ++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 13 deletions(-) diff --git a/crates/core/src/lib.rs b/crates/core/src/lib.rs index 4064848..0dbfbc8 100644 --- a/crates/core/src/lib.rs +++ b/crates/core/src/lib.rs @@ -2336,6 +2336,9 @@ fn build_regex_engine( let (engine, prefilter, prefilter_regex) = match lazy_options { Some(lazy_options) if lazy_options.lazy => { let windowed = lazy_options.prefilter_window_bytes.is_some(); + let capture_prepared = lazy_options + .prepared_artifact_policy + .should_capture(options.regex_artifact_policy); let prefilter = if lazy_options.prefilter_any.is_empty() { None } else { @@ -2350,12 +2353,11 @@ fn build_regex_engine( }; let prefilter_regex = lazy_options .prefilter_regex - .map(|source| build_prefilter_regex(source, regex_mode)) + .map(|source| { + build_prefilter_regex(source, regex_mode, capture_prepared) + }) .transpose()? .map(Box::new); - let capture_prepared = lazy_options - .prepared_artifact_policy - .should_capture(options.regex_artifact_policy); let prepared = capture_or_load_lazy_regex( &values, engine_options, @@ -2451,29 +2453,36 @@ fn build_regex_set( capture_prepared: bool, ) -> Result { match regex_mode { - RegexBuildMode::Build => regex_core::RegexSet::new(patterns, options), + RegexBuildMode::Build => build_source_regex_set(patterns, options), RegexBuildMode::Capture(artifacts) => { if !capture_prepared { artifacts.push(PreparedRegexArtifact { bytes: Vec::new() }); - return regex_core::RegexSet::new(patterns, options) - .map_err(|error| Error::BuildRegex(error.to_string())); + return build_source_regex_set(patterns, options); } let bytes = regex_core::RegexSet::prepare(patterns.clone(), options) .map_err(|error| Error::BuildRegex(error.to_string()))?; - let set = regex_core::RegexSet::with_prepared(patterns, options, &bytes); + let set = regex_core::RegexSet::with_prepared(patterns, options, &bytes) + .map_err(|error| Error::BuildRegex(error.to_string()))?; artifacts.push(PreparedRegexArtifact { bytes }); - set + Ok(set) } RegexBuildMode::Load { .. } => { let bytes = regex_mode.next_prepared_regex()?; if bytes.is_empty() { - return regex_core::RegexSet::new(patterns, options) - .map_err(|error| Error::BuildRegex(error.to_string())); + return build_source_regex_set(patterns, options); } regex_core::RegexSet::with_prepared(patterns, options, bytes) + .map_err(|error| Error::BuildRegex(error.to_string())) } } - .map_err(|error| Error::BuildRegex(error.to_string())) +} + +fn build_source_regex_set( + patterns: Vec, + options: regex_core::Options, +) -> Result { + regex_core::RegexSet::new(patterns, options) + .map_err(|error| Error::BuildRegex(error.to_string())) } fn capture_or_load_lazy_regex( @@ -2825,6 +2834,7 @@ where fn build_prefilter_regex( source: String, regex_mode: &mut RegexBuildMode<'_>, + capture_prepared: bool, ) -> Result { build_regex_set( vec![source], @@ -2833,7 +2843,7 @@ fn build_prefilter_regex( unicode_boundaries: true, }, regex_mode, - true, + capture_prepared, ) } diff --git a/crates/core/tests/text_search.rs b/crates/core/tests/text_search.rs index f987987..92e8ed3 100644 --- a/crates/core/tests/text_search.rs +++ b/crates/core/tests/text_search.rs @@ -550,6 +550,37 @@ fn prepared_lazy_regex_artifacts_can_be_omitted() { assert!(prepared.find_iter("Invoice-1234").unwrap().is_empty()); } +#[test] +fn prepared_lazy_prefilter_regex_artifacts_can_be_omitted() { + let mut regex = RegexPattern::new(r"\bTicket-\d{4}\b"); + regex.lazy = true; + regex.prefilter_any = vec![String::from("Ticket-")]; + regex.prefilter_regex = Some(String::from(r"\d{4}")); + regex.prepared_artifact_policy = PreparedArtifactPolicy::Omit; + let patterns = vec![PatternEntry::Regex(regex)]; + let options = TextSearchOptions::default(); + + let artifacts = + TextSearch::prepare_artifacts(patterns.clone(), options).unwrap(); + assert_eq!(artifacts.regex_sets.len(), 2); + assert!( + artifacts + .regex_sets + .iter() + .all(|artifact| artifact.bytes.is_empty()) + ); + + let direct = TextSearch::new(patterns.clone(), options).unwrap(); + let prepared = + TextSearch::with_prepared_artifacts(patterns, options, &artifacts).unwrap(); + + assert_eq!( + prepared.find_iter("Ticket-1234").unwrap(), + direct.find_iter("Ticket-1234").unwrap() + ); + assert!(prepared.find_iter("Ticket-abcd").unwrap().is_empty()); +} + #[test] fn prepared_lazy_regex_artifacts_can_be_omitted_by_default() { let mut regex = RegexPattern::new(r"\bCase-\d{4}\b"); @@ -603,6 +634,31 @@ fn prepared_lazy_regex_artifacts_can_override_global_omit() { assert_eq!(prepared.which_match("Claim-1234").unwrap(), vec![0]); } +#[test] +fn prepared_eager_regex_artifacts_can_override_global_omit() { + let mut regex = RegexPattern::new(r"\bReceipt-\d{4}\b"); + regex.prepared_artifact_policy = PreparedArtifactPolicy::Include; + let patterns = vec![PatternEntry::Regex(regex)]; + let options = TextSearchOptions { + regex_artifact_policy: RegexArtifactPolicy::Omit, + ..TextSearchOptions::default() + }; + + let artifacts = + TextSearch::prepare_artifacts(patterns.clone(), options).unwrap(); + assert_eq!(artifacts.regex_sets.len(), 1); + assert!( + artifacts + .regex_sets + .first() + .is_some_and(|artifact| !artifact.bytes.is_empty()) + ); + + let prepared = + TextSearch::with_prepared_artifacts(patterns, options, &artifacts).unwrap(); + assert_eq!(prepared.which_match("Receipt-1234").unwrap(), vec![0]); +} + #[test] fn prepared_eager_regex_artifacts_can_be_omitted() { let mut regex = RegexPattern::new(r"\bOrder-\d{4}\b");